From 9b35b43493daa89c0cfce4df431d6054392c13df Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 30 Apr 2024 11:25:20 +0200 Subject: [PATCH 01/60] Switch DCAT backend to not use one job for each dataset --- udata/core/dataset/rdf.py | 1 + udata/harvest/backends/base.py | 133 +++++++++++++++++++++++++++++++++ udata/harvest/backends/dcat.py | 100 +++++++++++++++---------- 3 files changed, 196 insertions(+), 38 deletions(-) diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index 793cd0c4ef..4d782507c3 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -560,6 +560,7 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None): dataset.description = sanitize_html(description) dataset.frequency = frequency_from_rdf(d.value(DCT.accrualPeriodicity)) dataset.contact_point = contact_point_from_rdf(d, dataset) or dataset.contact_point + print(dataset.contact_point) schema = schema_from_rdf(d) if schema: dataset.schema = schema diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 826991a314..7180db0db2 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -2,6 +2,7 @@ import traceback from datetime import datetime, date, timedelta +from typing import Optional from uuid import UUID import requests @@ -353,3 +354,135 @@ def validate(self, data, schema): errors.append(msg) msg = '\n- '.join(['Validation error:'] + errors) raise HarvestValidationError(msg) + + +class BaseSyncBackend(BaseBackend): + """ + Parent class that wrap children methods to add error management and debug logs. + + The flow is the following: + Parent Child + + harvest -> inner_harvest() + / + process_dataset <------ + \ + --------> inner_process_dataset() + + """ + + def inner_harvest(self): + raise NotImplementedError + + def inner_process_dataset(self, dataset: Optional[Dataset]): + raise NotImplementedError + + def harvest(self): + log.debug(f'Starting harvesting f{self.source.name} (f{self.source.url})…') + factory = HarvestJob if self.dryrun else HarvestJob.objects.create + self.job = factory(status='initialized', + started=datetime.utcnow(), + source=self.source) + + before_harvest_job.send(self) + + try: + self.inner_harvest() + self.job.status = 'done' + except HarvestValidationError as e: + log.info(f'Harvesting validation failed for "{safe_unicode(self.source.name)}" (f{self.source.backend})') + + self.job.status = 'failed' + + error = HarvestError(message=safe_unicode(e)) + self.job.errors.append(error) + except Exception as e: + log.exception(f'Harvesting failed for "{safe_unicode(self.source.name)}" (f{self.source.backend})') + + self.job.status = 'failed' + + error = HarvestError(message=safe_unicode(e)) + self.job.errors.append(error) + finally: + self.end_job() + + + def process_dataset(self, remote_id: str, debug_data: dict, **kwargs): + log.debug(f'Processing dataset f{remote_id}…') + + # TODO add `type` to `HarvestItem` to differentiate `Dataset` from `Dataservice` + item = HarvestItem(status='started', started=datetime.utcnow(), remote_id=remote_id, kwargs=debug_data) + self.job.items.append(item) + self.save_job() + + try: + dataset = Dataset.objects(__raw__={ + 'harvest.remote_id': remote_id, + '$or': [ + {'harvest.domain': self.source.domain}, + {'harvest.source_id': str(self.source.id)}, + ], + }).first() + + dataset = self.inner_process_dataset(dataset, **kwargs) + + if not dataset.harvest: + dataset.harvest = HarvestDatasetMetadata() + dataset.harvest.domain = self.source.domain + 
dataset.harvest.remote_id = item.remote_id + dataset.harvest.source_id = str(self.source.id) + dataset.harvest.last_update = datetime.utcnow() + dataset.harvest.backend = self.display_name + + # unset archived status if needed + if dataset.harvest: + dataset.harvest.archived_at = None + dataset.harvest.archived = None + dataset.archived = None + + # TODO permissions checking + if not dataset.organization and not dataset.owner: + if self.source.organization: + dataset.organization = self.source.organization + elif self.source.owner: + dataset.owner = self.source.owner + + # TODO: Apply editable mappings + + if self.dryrun: + dataset.validate() + else: + dataset.save() + item.dataset = dataset + item.status = 'done' + except HarvestSkipException as e: + log.info('Skipped item %s : %s', item.remote_id, safe_unicode(e)) + item.status = 'skipped' + item.errors.append(HarvestError(message=safe_unicode(e))) + except HarvestValidationError as e: + log.info('Error validating item %s : %s', item.remote_id, safe_unicode(e)) + item.status = 'failed' + item.errors.append(HarvestError(message=safe_unicode(e))) + except Exception as e: + log.exception('Error while processing %s : %s', + item.remote_id, + safe_unicode(e)) + error = HarvestError(message=safe_unicode(e), + details=traceback.format_exc()) + item.errors.append(error) + item.status = 'failed' + + item.ended = datetime.utcnow() + self.save_job() + + + def save_job(self): + if not self.dryrun: + self.job.save() + + def end_job(self): + self.job.ended = datetime.utcnow() + if not self.dryrun: + self.job.save() + + after_harvest_job.send(self) \ No newline at end of file diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index 4c835389af..ddd78f738d 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -9,13 +9,14 @@ import json from typing import List +from udata.core.dataset.models import Dataset from udata.rdf import ( DCAT, DCT, HYDRA, SPDX, namespace_manager, guess_format, url_from_rdf ) from udata.core.dataset.rdf import dataset_from_rdf from udata.storage.s3 import store_as_json, get_from_json -from .base import BaseBackend +from .base import BaseBackend, BaseSyncBackend log = logging.getLogger(__name__) @@ -56,16 +57,21 @@ def extract_graph(source, target, node, specs): extract_graph(source, target, o, specs[p]) -class DcatBackend(BaseBackend): +class DcatBackend(BaseSyncBackend): display_name = 'DCAT' - def initialize(self): - '''List all datasets for a given ...''' + def inner_harvest(self): fmt = self.get_format() - graphs = self.parse_graph(self.source.url, fmt) - self.job.data = { 'format': fmt } + graphs = self.walk_graph( + self.source.url, + fmt, + lambda page_number, page: self.process_datasets(page_number, page), + ) + + # TODO call `walk_graph` with `process_dataservices` + serialized_graphs = [graph.serialize(format=fmt, indent=None) for graph in graphs] # The official MongoDB document size in 16MB. The default value here is 15MB to account for other fields in the document (and for difference between * 1024 vs * 1000). @@ -105,13 +111,15 @@ def get_format(self): raise ValueError(msg) return fmt - def parse_graph(self, url, fmt) -> List[Graph]: + def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: """ - Returns an instance of rdflib.Graph for each detected page - The index in the list is the page number + Process the graphs by executing the `do()` callback on each page. 
+ + Returns all the pages in an array (index is the page number, value is + the rdflib.Graph of the page) for debug purposes (saved in `HarvestJob`) """ graphs = [] - page = 0 + page_number = 0 while url: subgraph = Graph(namespace_manager=namespace_manager) response = self.get(url) @@ -130,17 +138,24 @@ def parse_graph(self, url, fmt) -> List[Graph]: break graphs.append(subgraph) - for node in subgraph.subjects(RDF.type, DCAT.Dataset): - id = subgraph.value(node, DCT.identifier) - kwargs = {'page': page} - self.add_item(id, **kwargs) - if self.max_items and len(self.job.items) >= self.max_items: - # this will stop iterating on pagination - url = None + should_stop = do(page_number, subgraph) + if should_stop: + return - page += 1 + page_number += 1 return graphs + + def process_datasets(self, page_number, page): + for node in page.subjects(RDF.type, DCAT.Dataset): + remote_id = page.value(node, DCT.identifier) + should_stop = self.process_dataset(remote_id, debug_data = {'page_number': page_number}, page=page, node=node) + + if should_stop: + return True + + def inner_process_dataset(self, dataset: Dataset, page, node): + return dataset_from_rdf(page, dataset, node=node) def get_node_from_item(self, graph, item): for node in graph.subjects(RDF.type, DCAT.Dataset): @@ -209,7 +224,13 @@ class CswDcatBackend(DcatBackend): DCAT_SCHEMA = 'http://www.w3.org/ns/dcat#' - def parse_graph(self, url: str, fmt: str) -> List[Graph]: + def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: + """ + Process the graphs by executing the `do()` callback on each page. + + Returns all the pages in an array (index is the page number, value is + the rdflib.Graph of the page) for debug purposes (saved in `HarvestJob`) + """ body = ''' List[Graph]: headers = {'Content-Type': 'application/xml'} graphs = [] - page = 0 + page_number = 0 start = 1 response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA), @@ -242,18 +263,17 @@ def parse_graph(self, url: str, fmt: str) -> List[Graph]: graph = Graph(namespace_manager=namespace_manager) search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE}) if search_results is None: - log.error(f'No search results found for {url} on page {page}') + log.error(f'No search results found for {url} on page {page_number}') break for child in search_results: subgraph = Graph(namespace_manager=namespace_manager) subgraph.parse(data=ET.tostring(child), format=fmt) graph += subgraph - for node in subgraph.subjects(RDF.type, DCAT.Dataset): - id = subgraph.value(node, DCT.identifier) - kwargs = {'nid': str(node), 'page': page} - kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank' - self.add_item(id, **kwargs) + should_stop = do(page_number, subgraph) + if should_stop: + return + graphs.append(graph) next_record = self.next_record_if_should_continue(start, search_results) @@ -261,7 +281,7 @@ def parse_graph(self, url: str, fmt: str) -> List[Graph]: break start = next_record - page += 1 + page_number += 1 tree = ET.fromstring( self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA), @@ -283,12 +303,17 @@ class CswIso19139DcatBackend(DcatBackend): XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl" - def parse_graph(self, url: str, fmt: str) -> List[Graph]: - ''' + def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: + """ + Process the graphs by executing the `do()` callback on each page. 
+ + Returns all the pages in an array (index is the page number, value is + the rdflib.Graph of the page) for debug purposes (saved in `HarvestJob`) + Parse CSW graph querying ISO schema. Use SEMIC GeoDCAT-AP XSLT to map it to a correct version. See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more information on the XSLT. - ''' + """ # Load XSLT xsl = ET.fromstring(self.get(self.XSL_URL).content) @@ -315,7 +340,7 @@ def parse_graph(self, url: str, fmt: str) -> List[Graph]: headers = {'Content-Type': 'application/xml'} graphs = [] - page = 0 + page_number = 0 start = 1 response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA), @@ -332,7 +357,7 @@ def parse_graph(self, url: str, fmt: str) -> List[Graph]: # infos (useful for pagination) search_results = tree_before_transform.find('csw:SearchResults', {'csw': CSW_NAMESPACE}) if search_results is None: - log.error(f'No search results found for {url} on page {page}') + log.error(f'No search results found for {url} on page {page_number}') break subgraph = Graph(namespace_manager=namespace_manager) @@ -341,19 +366,18 @@ def parse_graph(self, url: str, fmt: str) -> List[Graph]: if not subgraph.subjects(RDF.type, DCAT.Dataset): raise ValueError("Failed to fetch CSW content") - for node in subgraph.subjects(RDF.type, DCAT.Dataset): - id = subgraph.value(node, DCT.identifier) - kwargs = {'nid': str(node), 'page': page} - kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank' - self.add_item(id, **kwargs) graphs.append(subgraph) + should_stop = do(page_number, subgraph) + if should_stop: + return + next_record = self.next_record_if_should_continue(start, search_results) if not next_record: break start = next_record - page += 1 + page_number += 1 response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA), headers=headers) From cb622a889e06d365b575b041c2b20ea3afeb2922 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 30 Apr 2024 11:36:36 +0200 Subject: [PATCH 02/60] Fix missing owner/org in new datasets --- udata/core/dataset/rdf.py | 2 ++ udata/harvest/backends/base.py | 49 ++++++++++++++++++---------------- udata/harvest/backends/dcat.py | 5 ++-- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index 4d782507c3..efcca3f54a 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -320,11 +320,13 @@ def temporal_from_rdf(period_of_time): def contact_point_from_rdf(rdf, dataset): contact_point = rdf.value(DCAT.contactPoint) + print(contact_point) if contact_point: name = rdf_value(contact_point, VCARD.fn) or '' email = (rdf_value(contact_point, VCARD.hasEmail) or rdf_value(contact_point, VCARD.email) or rdf_value(contact_point, DCAT.email)) + print(name) if not email: return email = email.replace('mailto:', '').strip() diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 7180db0db2..0a56880c3d 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -378,7 +378,7 @@ def inner_process_dataset(self, dataset: Optional[Dataset]): raise NotImplementedError def harvest(self): - log.debug(f'Starting harvesting f{self.source.name} (f{self.source.url})…') + log.debug(f'Starting harvesting {self.source.name} ({self.source.url})…') factory = HarvestJob if self.dryrun else HarvestJob.objects.create self.job = factory(status='initialized', started=datetime.utcnow(), @@ -390,25 +390,25 @@ def harvest(self): self.inner_harvest() self.job.status = 'done' except 
HarvestValidationError as e: - log.info(f'Harvesting validation failed for "{safe_unicode(self.source.name)}" (f{self.source.backend})') + log.info(f'Harvesting validation failed for "{safe_unicode(self.source.name)}" ({self.source.backend})') self.job.status = 'failed' error = HarvestError(message=safe_unicode(e)) self.job.errors.append(error) except Exception as e: - log.exception(f'Harvesting failed for "{safe_unicode(self.source.name)}" (f{self.source.backend})') + log.exception(f'Harvesting failed for "{safe_unicode(self.source.name)}" ({self.source.backend})') self.job.status = 'failed' - error = HarvestError(message=safe_unicode(e)) + error = HarvestError(message=safe_unicode(e), details=traceback.format_exc()) self.job.errors.append(error) finally: self.end_job() def process_dataset(self, remote_id: str, debug_data: dict, **kwargs): - log.debug(f'Processing dataset f{remote_id}…') + log.debug(f'Processing dataset {remote_id}…') # TODO add `type` to `HarvestItem` to differentiate `Dataset` from `Dataservice` item = HarvestItem(status='started', started=datetime.utcnow(), remote_id=remote_id, kwargs=debug_data) @@ -424,6 +424,17 @@ def process_dataset(self, remote_id: str, debug_data: dict, **kwargs): ], }).first() + # TODO check that the existing dataset belongs to the same owner/organization than + # the `HarvestSource`. Or is it always the case? + + if dataset is None: + if self.source.organization: + dataset = Dataset(organization=self.source.organization) + elif self.source.owner: + dataset = Dataset(owner=self.source.owner) + else: + raise Exception(f"HarvestSource#{self.source.id} doesn't have an owner nor an organization") + dataset = self.inner_process_dataset(dataset, **kwargs) if not dataset.harvest: @@ -440,13 +451,6 @@ def process_dataset(self, remote_id: str, debug_data: dict, **kwargs): dataset.harvest.archived = None dataset.archived = None - # TODO permissions checking - if not dataset.organization and not dataset.owner: - if self.source.organization: - dataset.organization = self.source.organization - elif self.source.owner: - dataset.owner = self.source.owner - # TODO: Apply editable mappings if self.dryrun: @@ -456,25 +460,24 @@ def process_dataset(self, remote_id: str, debug_data: dict, **kwargs): item.dataset = dataset item.status = 'done' except HarvestSkipException as e: - log.info('Skipped item %s : %s', item.remote_id, safe_unicode(e)) item.status = 'skipped' + + log.info(f'Skipped item {item.remote_id} : {safe_unicode(e)}') item.errors.append(HarvestError(message=safe_unicode(e))) except HarvestValidationError as e: - log.info('Error validating item %s : %s', item.remote_id, safe_unicode(e)) item.status = 'failed' + + log.info(f'Error validating item {item.remote_id} : {safe_unicode(e)}') item.errors.append(HarvestError(message=safe_unicode(e))) except Exception as e: - log.exception('Error while processing %s : %s', - item.remote_id, - safe_unicode(e)) - error = HarvestError(message=safe_unicode(e), - details=traceback.format_exc()) - item.errors.append(error) item.status = 'failed' + log.exception(f'Error while processing {item.remote_id} : {safe_unicode(e)}') - item.ended = datetime.utcnow() - self.save_job() - + error = HarvestError(message=safe_unicode(e), details=traceback.format_exc()) + item.errors.append(error) + finally: + item.ended = datetime.utcnow() + self.save_job() def save_job(self): if not self.dryrun: diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index ddd78f738d..d7bfd23f9b 100644 --- 
a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -146,15 +146,16 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: return graphs - def process_datasets(self, page_number, page): + def process_datasets(self, page_number: int, page: Graph): for node in page.subjects(RDF.type, DCAT.Dataset): + print(node.__class__) remote_id = page.value(node, DCT.identifier) should_stop = self.process_dataset(remote_id, debug_data = {'page_number': page_number}, page=page, node=node) if should_stop: return True - def inner_process_dataset(self, dataset: Dataset, page, node): + def inner_process_dataset(self, dataset: Dataset, page: Graph, node): return dataset_from_rdf(page, dataset, node=node) def get_node_from_item(self, graph, item): From cdd5f01a0c48961fd30bffa0cfcfe6ba90d1fcde Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 30 Apr 2024 11:37:21 +0200 Subject: [PATCH 03/60] Remove prints --- udata/core/dataset/rdf.py | 3 --- udata/harvest/backends/dcat.py | 1 - 2 files changed, 4 deletions(-) diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index efcca3f54a..793cd0c4ef 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -320,13 +320,11 @@ def temporal_from_rdf(period_of_time): def contact_point_from_rdf(rdf, dataset): contact_point = rdf.value(DCAT.contactPoint) - print(contact_point) if contact_point: name = rdf_value(contact_point, VCARD.fn) or '' email = (rdf_value(contact_point, VCARD.hasEmail) or rdf_value(contact_point, VCARD.email) or rdf_value(contact_point, DCAT.email)) - print(name) if not email: return email = email.replace('mailto:', '').strip() @@ -562,7 +560,6 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None): dataset.description = sanitize_html(description) dataset.frequency = frequency_from_rdf(d.value(DCT.accrualPeriodicity)) dataset.contact_point = contact_point_from_rdf(d, dataset) or dataset.contact_point - print(dataset.contact_point) schema = schema_from_rdf(d) if schema: dataset.schema = schema diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index d7bfd23f9b..23b6ae1a6b 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -148,7 +148,6 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: def process_datasets(self, page_number: int, page: Graph): for node in page.subjects(RDF.type, DCAT.Dataset): - print(node.__class__) remote_id = page.value(node, DCT.identifier) should_stop = self.process_dataset(remote_id, debug_data = {'page_number': page_number}, page=page, node=node) From 35ad7149e0eb6b0645d19a4f8e21bd89d4e4c07f Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 30 Apr 2024 11:40:36 +0200 Subject: [PATCH 04/60] Refactor using two functions --- udata/harvest/backends/base.py | 46 ++++++++++++---------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 0a56880c3d..b0d9996e64 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -416,39 +416,11 @@ def process_dataset(self, remote_id: str, debug_data: dict, **kwargs): self.save_job() try: - dataset = Dataset.objects(__raw__={ - 'harvest.remote_id': remote_id, - '$or': [ - {'harvest.domain': self.source.domain}, - {'harvest.source_id': str(self.source.id)}, - ], - }).first() - - # TODO check that the existing dataset belongs to the same owner/organization than - # the `HarvestSource`. Or is it always the case? 
- - if dataset is None: - if self.source.organization: - dataset = Dataset(organization=self.source.organization) - elif self.source.owner: - dataset = Dataset(owner=self.source.owner) - else: - raise Exception(f"HarvestSource#{self.source.id} doesn't have an owner nor an organization") + dataset = self.get_dataset(remote_id) dataset = self.inner_process_dataset(dataset, **kwargs) - if not dataset.harvest: - dataset.harvest = HarvestDatasetMetadata() - dataset.harvest.domain = self.source.domain - dataset.harvest.remote_id = item.remote_id - dataset.harvest.source_id = str(self.source.id) - dataset.harvest.last_update = datetime.utcnow() - dataset.harvest.backend = self.display_name - - # unset archived status if needed - if dataset.harvest: - dataset.harvest.archived_at = None - dataset.harvest.archived = None + dataset.harvest = self.update_harvest_info(dataset.harvest, remote_id) dataset.archived = None # TODO: Apply editable mappings @@ -479,6 +451,20 @@ def process_dataset(self, remote_id: str, debug_data: dict, **kwargs): item.ended = datetime.utcnow() self.save_job() + def update_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int): + if not harvest: + harvest = HarvestDatasetMetadata() + harvest.domain = self.source.domain + harvest.remote_id = remote_id + harvest.source_id = str(self.source.id) + harvest.last_update = datetime.utcnow() + harvest.backend = self.display_name + + harvest.archived_at = None + harvest.archived = None + + return harvest + def save_job(self): if not self.dryrun: self.job.save() From 76d99d8bdd1b5f79c5be17947cf2935ed3359dfe Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 30 Apr 2024 11:42:21 +0200 Subject: [PATCH 05/60] Add back should_stop --- udata/harvest/backends/base.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index b0d9996e64..a46f63be0d 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -407,7 +407,11 @@ def harvest(self): self.end_job() - def process_dataset(self, remote_id: str, debug_data: dict, **kwargs): + def process_dataset(self, remote_id: str, debug_data: dict, **kwargs) -> bool : + ''' + Return `True` if the parent should stop iterating because we exceed the number + of items to process. 
+ ''' log.debug(f'Processing dataset {remote_id}…') # TODO add `type` to `HarvestItem` to differentiate `Dataset` from `Dataservice` @@ -451,6 +455,9 @@ def process_dataset(self, remote_id: str, debug_data: dict, **kwargs): item.ended = datetime.utcnow() self.save_job() + return self.max_items and len(self.job.items) >= self.max_items + + def update_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int): if not harvest: harvest = HarvestDatasetMetadata() From e90ddcca2913b9e94f8849d0adc876d3164d2d17 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 30 Apr 2024 11:44:30 +0200 Subject: [PATCH 06/60] Add back autoarchive and done with failed items --- udata/harvest/backends/base.py | 7 +++++++ udata/harvest/tasks.py | 28 ++++++++++++++++------------ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index a46f63be0d..dbeca62e40 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -388,7 +388,14 @@ def harvest(self): try: self.inner_harvest() + + if self.source.autoarchive: + self.autoarchive() + self.job.status = 'done' + + if any(i.status == 'failed' for i in self.job.items): + self.job.status += '-errors' except HarvestValidationError as e: log.info(f'Harvesting validation failed for "{safe_unicode(self.source.name)}" ({self.source.backend})') diff --git a/udata/harvest/tasks.py b/udata/harvest/tasks.py index 59ad789e16..ec36ad1e21 100644 --- a/udata/harvest/tasks.py +++ b/udata/harvest/tasks.py @@ -1,6 +1,7 @@ from celery import chord from flask import current_app +from udata.harvest.backends.base import BaseSyncBackend from udata.tasks import job, get_logger, task from . import backends @@ -18,21 +19,24 @@ def harvest(self, ident): return # Ignore deleted sources Backend = backends.get(current_app, source.backend) backend = Backend(source) - items = backend.perform_initialization() - if items is None: - pass - elif items == 0: - backend.finalize() + + if isinstance(backend, BaseSyncBackend): + backend.harvest() else: - finalize = harvest_job_finalize.s(backend.job.id) - items = [ - harvest_job_item.s(backend.job.id, item.remote_id) - for item in backend.job.items - ] - chord(items)(finalize) + items = backend.perform_initialization() + if items is None: + pass + elif items == 0: + backend.finalize() + else: + finalize = harvest_job_finalize.s(backend.job.id) + items = [ + harvest_job_item.s(backend.job.id, item.remote_id) + for item in backend.job.items + ] + chord(items)(finalize) - @task(ignore_result=False, route='low.harvest') def harvest_job_item(job_id, item_id): log.info('Harvesting item %s for job "%s"', item_id, job_id) From 5dd3c04496761fa3bdddc6c669e62a3b88b01f0b Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Thu, 2 May 2024 16:16:26 +0200 Subject: [PATCH 07/60] Always returns the graphs for debug --- udata/harvest/backends/dcat.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index 41a8f38e54..8c19aacde7 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -140,7 +140,7 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: should_stop = do(page_number, subgraph) if should_stop: - return + return graphs page_number += 1 @@ -261,6 +261,8 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: raise ValueError(f'Failed to query CSW:\n{content}') while tree: graph = 
Graph(namespace_manager=namespace_manager) + graphs.append(graph) + search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE}) if search_results is None: log.error(f'No search results found for {url} on page {page_number}') @@ -272,9 +274,7 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: should_stop = do(page_number, subgraph) if should_stop: - return - - graphs.append(graph) + return graphs next_record = self.next_record_if_should_continue(start, search_results) if not next_record: @@ -376,7 +376,7 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: should_stop = do(page_number, subgraph) if should_stop: - return + return graphs next_record = self.next_record_if_should_continue(start, search_results) if not next_record: From d8eaf45f526c9daf63a8607407302765885f482b Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Thu, 2 May 2024 16:52:41 +0200 Subject: [PATCH 08/60] Add test for stopping due to HARVEST_MAX_ITEMS --- udata/harvest/tests/test_dcat_backend.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/udata/harvest/tests/test_dcat_backend.py b/udata/harvest/tests/test_dcat_backend.py index c2de9ef24d..14a3af4437 100644 --- a/udata/harvest/tests/test_dcat_backend.py +++ b/udata/harvest/tests/test_dcat_backend.py @@ -240,6 +240,19 @@ def test_harvest_big_catalog(self, rmock): actions.purge_jobs() assert get_from_json(current_app.config.get('HARVEST_GRAPHS_S3_BUCKET'), job.data['filename']) is None + @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas', HARVEST_MAX_ITEMS=2) + def test_harvest_max_items(self, rmock): + rmock.get('https://example.com/schemas', json=ResourceSchemaMockData.get_mock_data()) + + filename = 'bnodes.xml' + url = mock_dcat(rmock, filename) + org = OrganizationFactory() + source = HarvestSourceFactory(backend='dcat', url=url, organization=org) + + actions.run(source.slug) + + assert Dataset.objects.count() == 2 + assert HarvestJob.objects.first().status == 'done' @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas') def test_harvest_spatial(self, rmock): From ebf2af756ea8a2df397b0c0bec30e65776d1dad3 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 14 May 2024 08:40:18 +0200 Subject: [PATCH 09/60] Update test backends --- udata/harvest/backends/__init__.py | 2 +- udata/harvest/backends/base.py | 4 +++- udata/harvest/backends/dcat.py | 2 +- udata/harvest/tests/factories.py | 12 +++++++----- udata/harvest/tests/test_actions.py | 4 ++-- udata/harvest/tests/test_base_backend.py | 12 ++++++------ 6 files changed, 20 insertions(+), 16 deletions(-) diff --git a/udata/harvest/backends/__init__.py b/udata/harvest/backends/__init__.py index a054be9144..2863e5b951 100644 --- a/udata/harvest/backends/__init__.py +++ b/udata/harvest/backends/__init__.py @@ -14,4 +14,4 @@ def get_all(app): return get_enabled('udata.harvesters', app) -from .base import BaseBackend, HarvestFilter, HarvestFeature # flake8: noqa +from .base import BaseSyncBackend, HarvestFilter, HarvestFeature # flake8: noqa diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index dbeca62e40..186be28318 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -413,8 +413,10 @@ def harvest(self): finally: self.end_job() + return self.job + - def process_dataset(self, remote_id: str, debug_data: dict, **kwargs) -> bool : + def process_dataset(self, remote_id: str, debug_data: dict = {}, **kwargs) -> bool : ''' Return `True` if the parent should stop iterating because 
we exceed the number of items to process. diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index 8c19aacde7..c4ef6ef687 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -16,7 +16,7 @@ from udata.core.dataset.rdf import dataset_from_rdf from udata.storage.s3 import store_as_json, get_from_json -from .base import BaseBackend, BaseSyncBackend +from .base import BaseSyncBackend log = logging.getLogger(__name__) diff --git a/udata/harvest/tests/factories.py b/udata/harvest/tests/factories.py index 5ea9b2c12a..f572e9f05d 100644 --- a/udata/harvest/tests/factories.py +++ b/udata/harvest/tests/factories.py @@ -44,7 +44,7 @@ class Meta: DEFAULT_COUNT = 3 -class FactoryBackend(backends.BaseBackend): +class FactoryBackend(backends.BaseSyncBackend): name = 'factory' filters = ( backends.HarvestFilter('Test', 'test', int, 'An integer'), @@ -55,14 +55,16 @@ class FactoryBackend(backends.BaseBackend): backends.HarvestFeature('toggled', 'Toggled', 'A togglable', True), ) - def initialize(self): + def inner_harvest(self): mock_initialize.send(self) for i in range(self.config.get('count', DEFAULT_COUNT)): - self.add_item(i) + self.process_dataset(i, item=i) - def process(self, item): + def inner_process_dataset(self, dataset, item): mock_process.send(self, item=item) - return DatasetFactory.build(title='dataset-{0}'.format(item.remote_id)) + + dataset.title = f'dataset-{item}' + dataset.save() class MockBackendsMixin(object): diff --git a/udata/harvest/tests/test_actions.py b/udata/harvest/tests/test_actions.py index a08efb4bf7..3210a8ba6d 100644 --- a/udata/harvest/tests/test_actions.py +++ b/udata/harvest/tests/test_actions.py @@ -24,7 +24,7 @@ HarvestSource, HarvestJob, HarvestError, VALIDATION_PENDING, VALIDATION_ACCEPTED, VALIDATION_REFUSED ) -from ..backends import BaseBackend +from ..backends import BaseSyncBackend from .. 
import actions, signals @@ -38,7 +38,7 @@ class HarvestActionsTest: def test_list_backends(self): for backend in actions.list_backends(): - assert issubclass(backend, BaseBackend) + assert issubclass(backend, BaseSyncBackend) def test_list_sources(self): assert actions.list_sources() == [] diff --git a/udata/harvest/tests/test_base_backend.py b/udata/harvest/tests/test_base_backend.py index fd1f862097..fcd20bf569 100644 --- a/udata/harvest/tests/test_base_backend.py +++ b/udata/harvest/tests/test_base_backend.py @@ -13,7 +13,7 @@ from .factories import HarvestSourceFactory -from ..backends import BaseBackend, HarvestFilter, HarvestFeature +from ..backends import BaseSyncBackend, HarvestFilter, HarvestFeature from ..exceptions import HarvestException @@ -21,7 +21,7 @@ class Unknown: pass -class FakeBackend(BaseBackend): +class FakeBackend(BaseSyncBackend): filters = ( HarvestFilter('First filter', 'first', str), HarvestFilter('Second filter', 'second', str), @@ -31,12 +31,12 @@ class FakeBackend(BaseBackend): HarvestFeature('enabled', 'A test feature enabled by default', default=True), ) - def initialize(self): + def inner_harvest(self): for i in range(self.source.config.get('nb_datasets', 3)): - self.add_item('fake-{0}'.format(i)) + remote_id = f'fake-{i}' + self.process_dataset(remote_id) - def process(self, item): - dataset = self.get_dataset(item.remote_id) + def inner_process_dataset(self, dataset): for key, value in DatasetFactory.as_dict(visible=True).items(): setattr(dataset, key, value) if self.source.config.get('last_modified'): From a2701af7758f32b30c60bc300ff6763c2fd567c0 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 14 May 2024 09:13:17 +0200 Subject: [PATCH 10/60] Fix some tests --- udata/harvest/backends/base.py | 2 +- udata/harvest/tests/factories.py | 14 +++++++++----- udata/harvest/tests/test_actions.py | 4 ++-- udata/harvest/tests/test_base_backend.py | 9 +++++---- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 186be28318..ef566114cc 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -374,7 +374,7 @@ class BaseSyncBackend(BaseBackend): def inner_harvest(self): raise NotImplementedError - def inner_process_dataset(self, dataset: Optional[Dataset]): + def inner_process_dataset(self, dataset: Optional[Dataset]) -> Dataset: raise NotImplementedError def harvest(self): diff --git a/udata/harvest/tests/factories.py b/udata/harvest/tests/factories.py index f572e9f05d..e484db4c7c 100644 --- a/udata/harvest/tests/factories.py +++ b/udata/harvest/tests/factories.py @@ -58,13 +58,17 @@ class FactoryBackend(backends.BaseSyncBackend): def inner_harvest(self): mock_initialize.send(self) for i in range(self.config.get('count', DEFAULT_COUNT)): - self.process_dataset(i, item=i) + remote_id = f'{i}' + should_stop = self.process_dataset(remote_id, id=remote_id) + if should_stop: + return - def inner_process_dataset(self, dataset, item): - mock_process.send(self, item=item) + def inner_process_dataset(self, dataset, id): + mock_process.send(self, item=id) - dataset.title = f'dataset-{item}' - dataset.save() + dataset.title = f'dataset-{id}' + + return dataset class MockBackendsMixin(object): diff --git a/udata/harvest/tests/test_actions.py b/udata/harvest/tests/test_actions.py index 3210a8ba6d..68b5a30a46 100644 --- a/udata/harvest/tests/test_actions.py +++ b/udata/harvest/tests/test_actions.py @@ -580,7 +580,7 @@ def init(self): def test_error_on_item(self): def 
process(self, item): - if item.remote_id == '1': + if item == '1': raise ValueError('test') source = HarvestSourceFactory(backend='factory') @@ -723,7 +723,7 @@ def init(self): def test_preview_with_error_on_item(self): def process(self, item): - if item.remote_id == '1': + if item == '1': raise ValueError('test') source = HarvestSourceFactory(backend='factory') diff --git a/udata/harvest/tests/test_base_backend.py b/udata/harvest/tests/test_base_backend.py index fcd20bf569..8f4f77a564 100644 --- a/udata/harvest/tests/test_base_backend.py +++ b/udata/harvest/tests/test_base_backend.py @@ -33,8 +33,10 @@ class FakeBackend(BaseSyncBackend): def inner_harvest(self): for i in range(self.source.config.get('nb_datasets', 3)): - remote_id = f'fake-{i}' - self.process_dataset(remote_id) + remote_id = f'{i}' + should_stop = self.process_dataset(remote_id) + if should_stop: + return def inner_process_dataset(self, dataset): for key, value in DatasetFactory.as_dict(visible=True).items(): @@ -219,8 +221,7 @@ def test_autoarchive(self, app): assert 'archived_at' not in dataset_no_arch.harvest # test unarchive: archive manually then relaunch harvest - q = {'harvest__remote_id': 'fake-1'} - dataset = Dataset.objects.get(**q) + dataset = Dataset.objects.get(**{'harvest__remote_id': 'fake-1'}) dataset.archived = datetime.utcnow() dataset.harvest.archived = 'not-on-remote' dataset.harvest.archived_at = datetime.utcnow() From a163b70f347cd2bb6591cac20a08b9fc8d07f1e3 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 14 May 2024 09:14:09 +0200 Subject: [PATCH 11/60] Revert ID change for FakeBackend --- udata/harvest/tests/test_base_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udata/harvest/tests/test_base_backend.py b/udata/harvest/tests/test_base_backend.py index 8f4f77a564..93f62aab86 100644 --- a/udata/harvest/tests/test_base_backend.py +++ b/udata/harvest/tests/test_base_backend.py @@ -33,7 +33,7 @@ class FakeBackend(BaseSyncBackend): def inner_harvest(self): for i in range(self.source.config.get('nb_datasets', 3)): - remote_id = f'{i}' + remote_id = f'fake-{i}' should_stop = self.process_dataset(remote_id) if should_stop: return From 3693bd2dcadec98d063f13c11541c6385850ba22 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 14 May 2024 13:46:29 +0200 Subject: [PATCH 12/60] Simplify SyncBackend --- udata/harvest/backends/base.py | 12 +++++------- udata/harvest/backends/dcat.py | 8 ++++++-- udata/harvest/tests/factories.py | 13 +++++++------ udata/harvest/tests/test_base_backend.py | 5 ++++- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index ef566114cc..88833c9b24 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -374,7 +374,7 @@ class BaseSyncBackend(BaseBackend): def inner_harvest(self): raise NotImplementedError - def inner_process_dataset(self, dataset: Optional[Dataset]) -> Dataset: + def inner_process_dataset(self, item: HarvestItem) -> Dataset: raise NotImplementedError def harvest(self): @@ -397,7 +397,7 @@ def harvest(self): if any(i.status == 'failed' for i in self.job.items): self.job.status += '-errors' except HarvestValidationError as e: - log.info(f'Harvesting validation failed for "{safe_unicode(self.source.name)}" ({self.source.backend})') + log.exception(f'Harvesting validation failed for "{safe_unicode(self.source.name)}" ({self.source.backend})') self.job.status = 'failed' @@ -416,7 +416,7 @@ def harvest(self): return self.job - 
def process_dataset(self, remote_id: str, debug_data: dict = {}, **kwargs) -> bool : + def process_dataset(self, remote_id: str, **kwargs) -> bool : ''' Return `True` if the parent should stop iterating because we exceed the number of items to process. @@ -424,14 +424,12 @@ def process_dataset(self, remote_id: str, debug_data: dict = {}, **kwargs) -> bo log.debug(f'Processing dataset {remote_id}…') # TODO add `type` to `HarvestItem` to differentiate `Dataset` from `Dataservice` - item = HarvestItem(status='started', started=datetime.utcnow(), remote_id=remote_id, kwargs=debug_data) + item = HarvestItem(status='started', started=datetime.utcnow(), remote_id=remote_id) self.job.items.append(item) self.save_job() try: - dataset = self.get_dataset(remote_id) - - dataset = self.inner_process_dataset(dataset, **kwargs) + dataset = self.inner_process_dataset(item, **kwargs) dataset.harvest = self.update_harvest_info(dataset.harvest, remote_id) dataset.archived = None diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index c4ef6ef687..e91e9157c4 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -15,6 +15,7 @@ ) from udata.core.dataset.rdf import dataset_from_rdf from udata.storage.s3 import store_as_json, get_from_json +from udata.harvest.models import HarvestItem from .base import BaseSyncBackend @@ -149,12 +150,15 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: def process_datasets(self, page_number: int, page: Graph): for node in page.subjects(RDF.type, DCAT.Dataset): remote_id = page.value(node, DCT.identifier) - should_stop = self.process_dataset(remote_id, debug_data = {'page_number': page_number}, page=page, node=node) + should_stop = self.process_dataset(remote_id, page_number=page_number, page=page, node=node) if should_stop: return True - def inner_process_dataset(self, dataset: Dataset, page: Graph, node): + def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node): + item.kwargs['page_number'] = page_number + + dataset = self.get_dataset(item.remote_id) return dataset_from_rdf(page, dataset, node=node) def get_node_from_item(self, graph, item): diff --git a/udata/harvest/tests/factories.py b/udata/harvest/tests/factories.py index e484db4c7c..0bcdca3f5e 100644 --- a/udata/harvest/tests/factories.py +++ b/udata/harvest/tests/factories.py @@ -6,9 +6,10 @@ from udata.factories import ModelFactory from udata.core.dataset.factories import DatasetFactory +from udata.core.dataset.models import Dataset from .. 
import backends -from ..models import HarvestSource, HarvestJob +from ..models import HarvestItem, HarvestSource, HarvestJob def dtfactory(start, end): @@ -58,15 +59,15 @@ class FactoryBackend(backends.BaseSyncBackend): def inner_harvest(self): mock_initialize.send(self) for i in range(self.config.get('count', DEFAULT_COUNT)): - remote_id = f'{i}' - should_stop = self.process_dataset(remote_id, id=remote_id) + should_stop = self.process_dataset(str(i)) if should_stop: return - def inner_process_dataset(self, dataset, id): - mock_process.send(self, item=id) + def inner_process_dataset(self, item: HarvestItem): + mock_process.send(self, item=item.remote_id) - dataset.title = f'dataset-{id}' + dataset = self.get_dataset(item.remote_id) + dataset.title = f'dataset-{item.remote_id}' return dataset diff --git a/udata/harvest/tests/test_base_backend.py b/udata/harvest/tests/test_base_backend.py index 93f62aab86..b1167bed78 100644 --- a/udata/harvest/tests/test_base_backend.py +++ b/udata/harvest/tests/test_base_backend.py @@ -5,6 +5,7 @@ from dateutil.parser import parse from voluptuous import Schema +from udata.harvest.models import HarvestItem from udata.utils import faker from udata.core.dataset import tasks from udata.core.dataset.factories import DatasetFactory @@ -38,7 +39,9 @@ def inner_harvest(self): if should_stop: return - def inner_process_dataset(self, dataset): + def inner_process_dataset(self, item: HarvestItem): + dataset = self.get_dataset(item.remote_id) + for key, value in DatasetFactory.as_dict(visible=True).items(): setattr(dataset, key, value) if self.source.config.get('last_modified'): From a71b46e109f2a63fa4ef4e44a83ac2093f19b450 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 21 May 2024 15:18:04 +0200 Subject: [PATCH 13/60] fix wrong remote_id --- udata/harvest/backends/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 88833c9b24..24b33f7ae2 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -431,7 +431,8 @@ def process_dataset(self, remote_id: str, **kwargs) -> bool : try: dataset = self.inner_process_dataset(item, **kwargs) - dataset.harvest = self.update_harvest_info(dataset.harvest, remote_id) + # Use `item.remote_id` because `inner_process_dataset` could have modified it. 
+ dataset.harvest = self.update_harvest_info(dataset.harvest, item.remote_id) dataset.archived = None # TODO: Apply editable mappings From fc98e288c1b35ab64c7104cb69a531220fb86ed0 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Thu, 23 May 2024 15:19:15 +0200 Subject: [PATCH 14/60] Remove dead code --- udata/harvest/backends/__init__.py | 2 +- udata/harvest/backends/base.py | 323 +++++++---------------- udata/harvest/backends/dcat.py | 4 +- udata/harvest/tasks.py | 18 +- udata/harvest/tests/factories.py | 2 +- udata/harvest/tests/test_actions.py | 4 +- udata/harvest/tests/test_base_backend.py | 4 +- 7 files changed, 103 insertions(+), 254 deletions(-) diff --git a/udata/harvest/backends/__init__.py b/udata/harvest/backends/__init__.py index 2863e5b951..a054be9144 100644 --- a/udata/harvest/backends/__init__.py +++ b/udata/harvest/backends/__init__.py @@ -14,4 +14,4 @@ def get_all(app): return get_enabled('udata.harvesters', app) -from .base import BaseSyncBackend, HarvestFilter, HarvestFeature # flake8: noqa +from .base import BaseBackend, HarvestFilter, HarvestFeature # flake8: noqa diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 24b33f7ae2..25c6b9933b 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -128,235 +128,7 @@ def has_feature(self, key): def get_filters(self): return self.config.get('filters', []) - def harvest(self): - '''Start the harvesting process''' - if self.perform_initialization() is not None: - self.process_items() - self.finalize() - return self.job - - def perform_initialization(self): - '''Initialize the harvesting for a given job''' - log.debug('Initializing backend') - factory = HarvestJob if self.dryrun else HarvestJob.objects.create - self.job = factory(status='initializing', - started=datetime.utcnow(), - source=self.source) - - before_harvest_job.send(self) - - try: - self.initialize() - self.job.status = 'initialized' - if not self.dryrun: - self.job.save() - except HarvestValidationError as e: - log.info('Initialization failed for "%s" (%s)', - safe_unicode(self.source.name), self.source.backend) - error = HarvestError(message=safe_unicode(e)) - self.job.errors.append(error) - self.job.status = 'failed' - self.end() - return None - except Exception as e: - self.job.status = 'failed' - error = HarvestError(message=safe_unicode(e)) - self.job.errors.append(error) - self.end() - msg = 'Initialization failed for "{0.name}" ({0.backend})' - log.exception(msg.format(self.source)) - return None - - if self.max_items: - self.job.items = self.job.items[:self.max_items] - - if self.job.items: - log.debug('Queued %s items', len(self.job.items)) - - return len(self.job.items) - - def initialize(self): - raise NotImplementedError - - def process_items(self): - '''Process the data identified in the initialize stage''' - for item in self.job.items: - self.process_item(item) - - def process_item(self, item): - log.debug('Processing: %s', item.remote_id) - item.status = 'started' - item.started = datetime.utcnow() - if not self.dryrun: - self.job.save() - - try: - dataset = self.process(item) - if not dataset.harvest: - dataset.harvest = HarvestDatasetMetadata() - dataset.harvest.domain = self.source.domain - dataset.harvest.remote_id = item.remote_id - dataset.harvest.source_id = str(self.source.id) - dataset.harvest.last_update = datetime.utcnow() - dataset.harvest.backend = self.display_name - - # unset archived status if needed - if dataset.harvest: - dataset.harvest.archived_at = None - 
dataset.harvest.archived = None - dataset.archived = None - - # TODO permissions checking - if not dataset.organization and not dataset.owner: - if self.source.organization: - dataset.organization = self.source.organization - elif self.source.owner: - dataset.owner = self.source.owner - - # TODO: Apply editble mappings - - if self.dryrun: - dataset.validate() - else: - dataset.save() - item.dataset = dataset - item.status = 'done' - except HarvestSkipException as e: - log.info('Skipped item %s : %s', item.remote_id, safe_unicode(e)) - item.status = 'skipped' - item.errors.append(HarvestError(message=safe_unicode(e))) - except HarvestValidationError as e: - log.info('Error validating item %s : %s', item.remote_id, safe_unicode(e)) - item.status = 'failed' - item.errors.append(HarvestError(message=safe_unicode(e))) - except Exception as e: - log.exception('Error while processing %s : %s', - item.remote_id, - safe_unicode(e)) - error = HarvestError(message=safe_unicode(e), - details=traceback.format_exc()) - item.errors.append(error) - item.status = 'failed' - - item.ended = datetime.utcnow() - if not self.dryrun: - self.job.save() - - def autoarchive(self): - ''' - Archive items that exist on the local instance but not on remote platform - after a grace period of HARVEST_AUTOARCHIVE_GRACE_DAYS days. - ''' - log.debug('Running autoarchive') - limit_days = current_app.config['HARVEST_AUTOARCHIVE_GRACE_DAYS'] - limit_date = date.today() - timedelta(days=limit_days) - remote_ids = [i.remote_id for i in self.job.items if i.status != 'archived'] - q = { - 'harvest__source_id': str(self.source.id), - 'harvest__remote_id__nin': remote_ids, - 'harvest__last_update__lt': limit_date - } - local_items_not_on_remote = Dataset.objects.filter(**q) - - for dataset in local_items_not_on_remote: - if not dataset.harvest.archived_at: - archive_harvested_dataset(dataset, reason='not-on-remote', dryrun=self.dryrun) - # add a HarvestItem to the job list (useful for report) - # even when archiving has already been done (useful for debug) - item = self.add_item(dataset.harvest.remote_id) - item.dataset = dataset - item.status = 'archived' - - if not self.dryrun: - self.job.save() - - def process(self, item): - raise NotImplementedError - - def add_item(self, identifier, *args, **kwargs): - item = HarvestItem(remote_id=str(identifier), args=args, kwargs=kwargs) - self.job.items.append(item) - return item - - def finalize(self): - if self.source.autoarchive: - self.autoarchive() - self.job.status = 'done' - if any(i.status == 'failed' for i in self.job.items): - self.job.status += '-errors' - self.end() - - def end(self): - self.job.ended = datetime.utcnow() - if not self.dryrun: - self.job.save() - after_harvest_job.send(self) - - def get_dataset(self, remote_id): - '''Get or create a dataset given its remote ID (and its source) - We first try to match `source_id` to be source domain independent - ''' - dataset = Dataset.objects(__raw__={ - 'harvest.remote_id': remote_id, - '$or': [ - {'harvest.domain': self.source.domain}, - {'harvest.source_id': str(self.source.id)}, - ], - }).first() - - if dataset: - return dataset - - if self.source.organization: - return Dataset(organization=self.source.organization) - elif self.source.owner: - return Dataset(owner=self.source.owner) - - return Dataset() - - def validate(self, data, schema): - '''Perform a data validation against a given schema. 
- - :param data: an object to validate - :param schema: a Voluptous schema to validate against - ''' - try: - return schema(data) - except MultipleInvalid as ie: - errors = [] - for error in ie.errors: - if error.path: - field = '.'.join(str(p) for p in error.path) - path = error.path - value = data - while path: - attr = path.pop(0) - try: - if isinstance(value, (list, tuple)): - attr = int(attr) - value = value[attr] - except Exception: - value = None - - txt = safe_unicode(error).replace('for dictionary value', '') - txt = txt.strip() - if isinstance(error, RequiredFieldInvalid): - msg = '[{0}] {1}' - else: - msg = '[{0}] {1}: {2}' - try: - msg = msg.format(field, txt, str(value)) - except Exception: - msg = '[{0}] {1}'.format(field, txt) - - else: - msg = str(error) - errors.append(msg) - msg = '\n- '.join(['Validation error:'] + errors) - raise HarvestValidationError(msg) - -class BaseSyncBackend(BaseBackend): """ Parent class that wrap children methods to add error management and debug logs. @@ -414,7 +186,6 @@ def harvest(self): self.end_job() return self.job - def process_dataset(self, remote_id: str, **kwargs) -> bool : ''' @@ -489,4 +260,96 @@ def end_job(self): if not self.dryrun: self.job.save() - after_harvest_job.send(self) \ No newline at end of file + after_harvest_job.send(self) + + def autoarchive(self): + ''' + Archive items that exist on the local instance but not on remote platform + after a grace period of HARVEST_AUTOARCHIVE_GRACE_DAYS days. + ''' + log.debug('Running autoarchive') + limit_days = current_app.config['HARVEST_AUTOARCHIVE_GRACE_DAYS'] + limit_date = date.today() - timedelta(days=limit_days) + remote_ids = [i.remote_id for i in self.job.items if i.status != 'archived'] + q = { + 'harvest__source_id': str(self.source.id), + 'harvest__remote_id__nin': remote_ids, + 'harvest__last_update__lt': limit_date + } + local_items_not_on_remote = Dataset.objects.filter(**q) + + for dataset in local_items_not_on_remote: + if not dataset.harvest.archived_at: + archive_harvested_dataset(dataset, reason='not-on-remote', dryrun=self.dryrun) + # add a HarvestItem to the job list (useful for report) + # even when archiving has already been done (useful for debug) + self.job.items.append(HarvestItem( + remote_id=str(dataset.harvest.remote_id), + dataset=dataset, + status='archived' + )) + + self.save_job() + + def get_dataset(self, remote_id): + '''Get or create a dataset given its remote ID (and its source) + We first try to match `source_id` to be source domain independent + ''' + dataset = Dataset.objects(__raw__={ + 'harvest.remote_id': remote_id, + '$or': [ + {'harvest.domain': self.source.domain}, + {'harvest.source_id': str(self.source.id)}, + ], + }).first() + + if dataset: + return dataset + + if self.source.organization: + return Dataset(organization=self.source.organization) + elif self.source.owner: + return Dataset(owner=self.source.owner) + + return Dataset() + + def validate(self, data, schema): + '''Perform a data validation against a given schema. 
+ + :param data: an object to validate + :param schema: a Voluptous schema to validate against + ''' + try: + return schema(data) + except MultipleInvalid as ie: + errors = [] + for error in ie.errors: + if error.path: + field = '.'.join(str(p) for p in error.path) + path = error.path + value = data + while path: + attr = path.pop(0) + try: + if isinstance(value, (list, tuple)): + attr = int(attr) + value = value[attr] + except Exception: + value = None + + txt = safe_unicode(error).replace('for dictionary value', '') + txt = txt.strip() + if isinstance(error, RequiredFieldInvalid): + msg = '[{0}] {1}' + else: + msg = '[{0}] {1}: {2}' + try: + msg = msg.format(field, txt, str(value)) + except Exception: + msg = '[{0}] {1}'.format(field, txt) + + else: + msg = str(error) + errors.append(msg) + msg = '\n- '.join(['Validation error:'] + errors) + raise HarvestValidationError(msg) \ No newline at end of file diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index e91e9157c4..3703ce9657 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -17,7 +17,7 @@ from udata.storage.s3 import store_as_json, get_from_json from udata.harvest.models import HarvestItem -from .base import BaseSyncBackend +from .base import BaseBackend log = logging.getLogger(__name__) @@ -58,7 +58,7 @@ def extract_graph(source, target, node, specs): extract_graph(source, target, o, specs[p]) -class DcatBackend(BaseSyncBackend): +class DcatBackend(BaseBackend): display_name = 'DCAT' def inner_harvest(self): diff --git a/udata/harvest/tasks.py b/udata/harvest/tasks.py index ec36ad1e21..67dcf53139 100644 --- a/udata/harvest/tasks.py +++ b/udata/harvest/tasks.py @@ -1,7 +1,6 @@ from celery import chord from flask import current_app -from udata.harvest.backends.base import BaseSyncBackend from udata.tasks import job, get_logger, task from . import backends @@ -20,21 +19,8 @@ def harvest(self, ident): Backend = backends.get(current_app, source.backend) backend = Backend(source) - if isinstance(backend, BaseSyncBackend): - backend.harvest() - else: - items = backend.perform_initialization() - if items is None: - pass - elif items == 0: - backend.finalize() - else: - finalize = harvest_job_finalize.s(backend.job.id) - items = [ - harvest_job_item.s(backend.job.id, item.remote_id) - for item in backend.job.items - ] - chord(items)(finalize) + backend.harvest() + @task(ignore_result=False, route='low.harvest') diff --git a/udata/harvest/tests/factories.py b/udata/harvest/tests/factories.py index 0bcdca3f5e..d9ff672ae8 100644 --- a/udata/harvest/tests/factories.py +++ b/udata/harvest/tests/factories.py @@ -45,7 +45,7 @@ class Meta: DEFAULT_COUNT = 3 -class FactoryBackend(backends.BaseSyncBackend): +class FactoryBackend(backends.BaseBackend): name = 'factory' filters = ( backends.HarvestFilter('Test', 'test', int, 'An integer'), diff --git a/udata/harvest/tests/test_actions.py b/udata/harvest/tests/test_actions.py index 68b5a30a46..c80d8d992f 100644 --- a/udata/harvest/tests/test_actions.py +++ b/udata/harvest/tests/test_actions.py @@ -24,7 +24,7 @@ HarvestSource, HarvestJob, HarvestError, VALIDATION_PENDING, VALIDATION_ACCEPTED, VALIDATION_REFUSED ) -from ..backends import BaseSyncBackend +from ..backends import BaseBackend from .. 
import actions, signals @@ -38,7 +38,7 @@ class HarvestActionsTest: def test_list_backends(self): for backend in actions.list_backends(): - assert issubclass(backend, BaseSyncBackend) + assert issubclass(backend, BaseBackend) def test_list_sources(self): assert actions.list_sources() == [] diff --git a/udata/harvest/tests/test_base_backend.py b/udata/harvest/tests/test_base_backend.py index b1167bed78..67462a4f14 100644 --- a/udata/harvest/tests/test_base_backend.py +++ b/udata/harvest/tests/test_base_backend.py @@ -14,7 +14,7 @@ from .factories import HarvestSourceFactory -from ..backends import BaseSyncBackend, HarvestFilter, HarvestFeature +from ..backends import BaseBackend, HarvestFilter, HarvestFeature from ..exceptions import HarvestException @@ -22,7 +22,7 @@ class Unknown: pass -class FakeBackend(BaseSyncBackend): +class FakeBackend(BaseBackend): filters = ( HarvestFilter('First filter', 'first', str), HarvestFilter('Second filter', 'second', str), From 20ce7e4c3e8b6aaa909b8df43a6212a7c45dac08 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Thu, 23 May 2024 15:38:18 +0200 Subject: [PATCH 15/60] update comment --- udata/harvest/backends/base.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 25c6b9933b..c18105ffea 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -137,9 +137,16 @@ def get_filters(self): harvest -> inner_harvest() / - process_dataset <------ + process_dataset (create HarvestItem) <------ \ - --------> inner_process_dataset() + --------> inner_process_dataset() (call get_dataset() and update object) + + + process_dataset: + 1. Create HarvestItem + 2. Call inner_process_dataset(item) + 3. Save HarvestItem (dryrun) + 4. Save dataset (dryrun) """ From b9b41e23f2ee61b775fb24ebd0176c31b2bfbea9 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Mon, 27 May 2024 14:45:10 +0200 Subject: [PATCH 16/60] Move docstring --- udata/harvest/backends/base.py | 43 ++++++++++++++++------------------ 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index c18105ffea..0a9cbc16b2 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -69,7 +69,26 @@ def as_dict(self): class BaseBackend(object): - '''Base class for Harvester implementations''' + """ + Base class that wrap children methods to add error management and debug logs. + Also provides a few helpers needed on all or some backends. + + The flow is the following: + Parent Child + + harvest -> inner_harvest() + / + process_dataset (create HarvestItem) <------ + \ + --------> inner_process_dataset() (call get_dataset() and update object) + + + process_dataset: + 1. Create HarvestItem + 2. Call inner_process_dataset(item) + 3. Save HarvestItem (dryrun) + 4. Save dataset (dryrun) + """ name = None display_name = None @@ -128,28 +147,6 @@ def has_feature(self, key): def get_filters(self): return self.config.get('filters', []) - - """ - Parent class that wrap children methods to add error management and debug logs. - - The flow is the following: - Parent Child - - harvest -> inner_harvest() - / - process_dataset (create HarvestItem) <------ - \ - --------> inner_process_dataset() (call get_dataset() and update object) - - - process_dataset: - 1. Create HarvestItem - 2. Call inner_process_dataset(item) - 3. Save HarvestItem (dryrun) - 4. 
Save dataset (dryrun)
-
-    """
-
     def inner_harvest(self):
         raise NotImplementedError
 

From 0a6dfa56fa38fa1711309de06f4d61d99d84b6f7 Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Mon, 27 May 2024 14:47:39 +0200
Subject: [PATCH 17/60] Switch is_done() to its own function

---
 udata/harvest/backends/base.py           | 9 +++------
 udata/harvest/backends/dcat.py           | 4 ++--
 udata/harvest/tests/factories.py         | 4 ++--
 udata/harvest/tests/test_base_backend.py | 4 ++--
 4 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py
index 0a9cbc16b2..60eb5447a7 100644
--- a/udata/harvest/backends/base.py
+++ b/udata/harvest/backends/base.py
@@ -191,11 +191,7 @@ def harvest(self):
 
         return self.job
 
-    def process_dataset(self, remote_id: str, **kwargs) -> bool :
-        '''
-        Return `True` if the parent should stop iterating because we exceed the number
-        of items to process.
-        '''
+    def process_dataset(self, remote_id: str, **kwargs):
         log.debug(f'Processing dataset {remote_id}…')
 
         # TODO add `type` to `HarvestItem` to differentiate `Dataset` from `Dataservice`
@@ -238,9 +234,10 @@ def process_dataset(self, remote_id: str, **kwargs) -> bool :
 
         item.ended = datetime.utcnow()
         self.save_job()
+
+    def is_done(self) -> bool:
+        '''Should be called after process_dataset to know whether we have reached the max number of items'''
         return self.max_items and len(self.job.items) >= self.max_items
-
     def update_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int):
         if not harvest:
             harvest = HarvestDatasetMetadata()
diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py
index 3703ce9657..c859ffa0e4 100644
--- a/udata/harvest/backends/dcat.py
+++ b/udata/harvest/backends/dcat.py
@@ -150,9 +150,9 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]:
     def process_datasets(self, page_number: int, page: Graph):
         for node in page.subjects(RDF.type, DCAT.Dataset):
             remote_id = page.value(node, DCT.identifier)
-            should_stop = self.process_dataset(remote_id, page_number=page_number, page=page, node=node)
+            self.process_dataset(remote_id, page_number=page_number, page=page, node=node)
 
-            if should_stop:
+            if self.is_done():
                 return True
 
     def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node):
diff --git a/udata/harvest/tests/factories.py b/udata/harvest/tests/factories.py
index d9ff672ae8..579c51f9bb 100644
--- a/udata/harvest/tests/factories.py
+++ b/udata/harvest/tests/factories.py
@@ -59,8 +59,8 @@ class FactoryBackend(backends.BaseBackend):
     def inner_harvest(self):
         mock_initialize.send(self)
         for i in range(self.config.get('count', DEFAULT_COUNT)):
-            should_stop = self.process_dataset(str(i))
-            if should_stop:
+            self.process_dataset(str(i))
+            if self.is_done():
                 return
 
     def inner_process_dataset(self, item: HarvestItem):
diff --git a/udata/harvest/tests/test_base_backend.py b/udata/harvest/tests/test_base_backend.py
index 67462a4f14..b591be350b 100644
--- a/udata/harvest/tests/test_base_backend.py
+++ b/udata/harvest/tests/test_base_backend.py
@@ -35,8 +35,8 @@ class FakeBackend(BaseBackend):
     def inner_harvest(self):
         for i in range(self.source.config.get('nb_datasets', 3)):
             remote_id = f'fake-{i}'
-            should_stop = self.process_dataset(remote_id)
-            if should_stop:
+            self.process_dataset(remote_id)
+            if self.is_done():
                 return
 

From 523c754d1e973afdc3f36a3f8e56a1d1f5d671ba Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Mon, 27 May 2024 14:58:47 +0200
Subject: [PATCH 18/60] 
Rename process_datasets method --- udata/harvest/backends/dcat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index c859ffa0e4..bade92f637 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -68,7 +68,7 @@ def inner_harvest(self): graphs = self.walk_graph( self.source.url, fmt, - lambda page_number, page: self.process_datasets(page_number, page), + lambda page_number, page: self.process_one_datasets_page(page_number, page), ) # TODO call `walk_graph` with `process_dataservices` @@ -147,7 +147,7 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: return graphs - def process_datasets(self, page_number: int, page: Graph): + def process_one_datasets_page(self, page_number: int, page: Graph): for node in page.subjects(RDF.type, DCAT.Dataset): remote_id = page.value(node, DCT.identifier) self.process_dataset(remote_id, page_number=page_number, page=page, node=node) From 064f8eb3c9acd2d3a6327dbc8a49b74362cd7f5b Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Mon, 27 May 2024 15:38:22 +0200 Subject: [PATCH 19/60] yield instead of callback --- udata/harvest/backends/dcat.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index bade92f637..00435dfec6 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -7,7 +7,7 @@ from flask import current_app from datetime import date import json -from typing import List +from typing import Generator, List from udata.core.dataset.models import Dataset from udata.rdf import ( @@ -65,15 +65,13 @@ def inner_harvest(self): fmt = self.get_format() self.job.data = { 'format': fmt } - graphs = self.walk_graph( - self.source.url, - fmt, - lambda page_number, page: self.process_one_datasets_page(page_number, page), - ) + serialized_graphs = [] - # TODO call `walk_graph` with `process_dataservices` + for page_number, page in self.walk_graph(self.source.url, fmt): + self.process_one_datasets_page(page_number, page) + serialized_graphs.append(page.serialize(format=fmt, indent=None)) - serialized_graphs = [graph.serialize(format=fmt, indent=None) for graph in graphs] + # TODO call `walk_graph` with `process_dataservices` # The official MongoDB document size in 16MB. The default value here is 15MB to account for other fields in the document (and for difference between * 1024 vs * 1000). max_harvest_graph_size_in_mongo = current_app.config.get('HARVEST_MAX_CATALOG_SIZE_IN_MONGO') @@ -112,14 +110,13 @@ def get_format(self): raise ValueError(msg) return fmt - def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: + def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]: """ Process the graphs by executing the `do()` callback on each page. 
Returns all the pages in an array (index is the page number, value is the rdflib.Graph of the page) for debug purposes (saved in `HarvestJob`) """ - graphs = [] page_number = 0 while url: subgraph = Graph(namespace_manager=namespace_manager) @@ -137,15 +134,12 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: pagination = subgraph.resource(pagination) url = url_from_rdf(pagination, prop) break - graphs.append(subgraph) - should_stop = do(page_number, subgraph) - if should_stop: - return graphs + yield page_number, subgraph + if self.is_done(): + return page_number += 1 - - return graphs def process_one_datasets_page(self, page_number: int, page: Graph): for node in page.subjects(RDF.type, DCAT.Dataset): @@ -153,7 +147,7 @@ def process_one_datasets_page(self, page_number: int, page: Graph): self.process_dataset(remote_id, page_number=page_number, page=page, node=node) if self.is_done(): - return True + return def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node): item.kwargs['page_number'] = page_number From 263c3a3057e8148f6614fc06db3fbdb5ece46c1b Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Mon, 27 May 2024 15:48:25 +0200 Subject: [PATCH 20/60] fix other backends --- udata/harvest/backends/dcat.py | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index 00435dfec6..2f4161c1d3 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -222,7 +222,7 @@ class CswDcatBackend(DcatBackend): DCAT_SCHEMA = 'http://www.w3.org/ns/dcat#' - def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: + def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]: """ Process the graphs by executing the `do()` callback on each page. 
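To make the new contract concrete: `walk_graph` is now a lazy generator that the harvester pulls pages from, and `is_done()` caps the total number of processed items so that unneeded pages are never fetched. A minimal, runnable sketch of that pull-based pattern, with `PagedSource` and `SketchBackend` as illustrative stand-ins rather than udata classes:

    from typing import Generator

    class PagedSource:
        '''Hypothetical stand-in for a paginated DCAT catalog.'''
        def __init__(self, pages):
            self.pages = pages

    class SketchBackend:
        '''Hypothetical backend showing the generator-based pagination contract.'''
        def __init__(self, source, max_items=None):
            self.source = source
            self.max_items = max_items
            self.items = []

        def is_done(self):
            # Same predicate as BaseBackend.is_done(): a falsy max_items disables the cap
            return bool(self.max_items) and len(self.items) >= self.max_items

        def walk_graph(self) -> Generator[tuple, None, None]:
            # Pages are only produced while the consumer keeps iterating
            for page_number, page in enumerate(self.source.pages):
                yield page_number, page
                if self.is_done():
                    return  # stop paginating once the item budget is spent

        def inner_harvest(self):
            for page_number, page in self.walk_graph():
                for remote_id in page:
                    self.items.append(remote_id)
                    if self.is_done():
                        break

    backend = SketchBackend(PagedSource([['a', 'b'], ['c', 'd']]), max_items=3)
    backend.inner_harvest()
    assert backend.items == ['a', 'b', 'c']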
@@ -246,7 +246,6 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: ''' headers = {'Content-Type': 'application/xml'} - graphs = [] page_number = 0 start = 1 @@ -258,9 +257,6 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport': raise ValueError(f'Failed to query CSW:\n{content}') while tree: - graph = Graph(namespace_manager=namespace_manager) - graphs.append(graph) - search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE}) if search_results is None: log.error(f'No search results found for {url} on page {page_number}') @@ -268,11 +264,10 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: for child in search_results: subgraph = Graph(namespace_manager=namespace_manager) subgraph.parse(data=ET.tostring(child), format=fmt) - graph += subgraph - should_stop = do(page_number, subgraph) - if should_stop: - return graphs + yield page_number, subgraph + if self.is_done(): + return next_record = self.next_record_if_should_continue(start, search_results) if not next_record: @@ -285,8 +280,6 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA), headers=headers).content) - return graphs - class CswIso19139DcatBackend(DcatBackend): ''' @@ -300,7 +293,7 @@ class CswIso19139DcatBackend(DcatBackend): XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl" - def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: + def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]: """ Process the graphs by executing the `do()` callback on each page. @@ -343,7 +336,6 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: ''' headers = {'Content-Type': 'application/xml'} - graphs = [] page_number = 0 start = 1 @@ -370,11 +362,9 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: if not subgraph.subjects(RDF.type, DCAT.Dataset): raise ValueError("Failed to fetch CSW content") - graphs.append(subgraph) - - should_stop = do(page_number, subgraph) - if should_stop: - return graphs + yield page_number, subgraph + if self.is_done(): + return next_record = self.next_record_if_should_continue(start, search_results) if not next_record: @@ -389,5 +379,3 @@ def walk_graph(self, url: str, fmt: str, do) -> List[Graph]: tree_before_transform = ET.fromstring(response.content) tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'") - - return graphs From 9124fc080637d9f6498f854398c7e5589e18a266 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 29 May 2024 15:15:07 +0200 Subject: [PATCH 21/60] Update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e13c2dd3d..ca46423432 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ## Current (in progress) -- Nothing yet +- **breaking change** Harvest backend is now sync [#3030](https://github.com/opendatateam/udata/pull/3030) ## 8.0.1 (2024-05-28) From e9b9af13e854ca146c5ac8f38806f06bce4eca5e Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 29 May 2024 16:44:18 +0200 Subject: [PATCH 22/60] Remove unused process method --- udata/harvest/backends/dcat.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index 2f4161c1d3..1ff0498def 100644 --- a/udata/harvest/backends/dcat.py +++ 
b/udata/harvest/backends/dcat.py @@ -161,33 +161,6 @@ def get_node_from_item(self, graph, item): return node raise ValueError(f'Unable to find dataset with DCT.identifier:{item.remote_id}') - def process(self, item): - if item.remote_id == 'None': - raise ValueError('The DCT.identifier is missing on this DCAT.Dataset record') - graph = Graph(namespace_manager=namespace_manager) - - if self.job.data.get('graphs') is not None: - graphs = self.job.data['graphs'] - else: - bucket = current_app.config.get('HARVEST_GRAPHS_S3_BUCKET') - if bucket is None: - raise ValueError(f"No bucket configured but the harvest job item {item.id} on job {self.job.id} doesn't have a graph in MongoDB.") - - graphs = get_from_json(bucket, self.job.data['filename']) - if graphs is None: - raise ValueError(f"The file '{self.job.data['filename']}' is missing in S3 bucket '{bucket}'") - - data = graphs[item.kwargs['page']] - format = self.job.data['format'] - - graph.parse(data=bytes(data, encoding='utf8'), format=format) - node = self.get_node_from_item(graph, item) - - dataset = self.get_dataset(item.remote_id) - dataset = dataset_from_rdf(graph, dataset, node=node) - return dataset - - def next_record_if_should_continue(self, start, search_results): next_record = int(search_results.attrib['nextRecord']) matched_count = int(search_results.attrib['numberOfRecordsMatched']) From 318603ddc7c76a9a496c42c1f7d608477a202c45 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Thu, 30 May 2024 11:26:36 +0200 Subject: [PATCH 23/60] Skip dataset without remote_id --- udata/harvest/backends/dcat.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index 1ff0498def..573c028cc1 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -144,6 +144,10 @@ def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, N def process_one_datasets_page(self, page_number: int, page: Graph): for node in page.subjects(RDF.type, DCAT.Dataset): remote_id = page.value(node, DCT.identifier) + if not remote_id: + log.warning(f"Skipping dataset because no `remote_id`") + continue + self.process_dataset(remote_id, page_number=page_number, page=page, node=node) if self.is_done(): From e1b1bc0a20998fafa14d6782a7191317ae4467a4 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Mon, 29 Apr 2024 14:36:52 +0200 Subject: [PATCH 24/60] Harvest dataservices --- udata/core/dataservices/rdf.py | 28 ++++++++++++++++++++++++ udata/harvest/tests/dcat/bnodes.xml | 15 +++++++++++++ udata/harvest/tests/test_dcat_backend.py | 21 ++++++++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 udata/core/dataservices/rdf.py diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py new file mode 100644 index 0000000000..5a30c107b3 --- /dev/null +++ b/udata/core/dataservices/rdf.py @@ -0,0 +1,28 @@ + +from pprint import pprint +from typing import Optional +from rdflib import RDF, Graph +import rdflib +import lxml.etree as ET + +from udata.core.dataservices.models import Dataservice +from udata.core.dataset.rdf import rdf_value, remote_url_from_rdf +from udata.rdf import DCAT, DCT, url_from_rdf + + +def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None, node=None): + ''' + Create or update a dataset from a RDF/DCAT graph + ''' + dataservice = dataservice or Dataservice() + + if node is None: # Assume first match is the only match + node = graph.value(predicate=RDF.type, object=DCAT.DataService) + + d = 
graph.resource(node) + + dataservice.title = rdf_value(d, DCT.title) + dataservice.base_api_url = rdf_value(d, DCAT.endpointURL) + dataservice.endpoint_description_url = rdf_value(d, DCAT.endpointDescription) + + return dataservice \ No newline at end of file diff --git a/udata/harvest/tests/dcat/bnodes.xml b/udata/harvest/tests/dcat/bnodes.xml index f064949f1d..892f6763dd 100644 --- a/udata/harvest/tests/dcat/bnodes.xml +++ b/udata/harvest/tests/dcat/bnodes.xml @@ -98,6 +98,21 @@ + + + + https://data.paris2024.org/api/explore/v2.1/ + + + + + + + + + + + 2016-12-15T09:19:51.723691 http://data.test.org en diff --git a/udata/harvest/tests/test_dcat_backend.py b/udata/harvest/tests/test_dcat_backend.py index 14a3af4437..c8d5374352 100644 --- a/udata/harvest/tests/test_dcat_backend.py +++ b/udata/harvest/tests/test_dcat_backend.py @@ -9,6 +9,7 @@ from flask import current_app import xml.etree.ElementTree as ET +from udata.core.dataservices.models import Dataservice from udata.harvest.models import HarvestJob from udata.models import Dataset from udata.core.organization.factories import OrganizationFactory @@ -142,6 +143,7 @@ def test_flat_with_blank_nodes(self, rmock): assert datasets['1'].resources[0].mime == 'application/json' @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas', HARVEST_MAX_CATALOG_SIZE_IN_MONGO=None, HARVEST_GRAPHS_S3_BUCKET="test_bucket", S3_URL="https://example.org", S3_ACCESS_KEY_ID="myUser", S3_SECRET_ACCESS_KEY="password") + def test_flat_with_blank_nodes_xml(self, rmock): rmock.get('https://example.com/schemas', json=ResourceSchemaMockData.get_mock_data()) @@ -161,6 +163,25 @@ def test_flat_with_blank_nodes_xml(self, rmock): assert len(datasets['1'].resources) == 2 assert len(datasets['2'].resources) == 2 + def test_harvest_dataservices(self, rmock): + rmock.get('https://example.com/schemas', json=ResourceSchemaMockData.get_mock_data()) + + filename = 'bnodes.xml' + url = mock_dcat(rmock, filename) + org = OrganizationFactory() + source = HarvestSourceFactory(backend='dcat', + url=url, + organization=org) + + actions.run(source.slug) + + dataservices = Dataservice.objects + + assert len(dataservices) == 1 + assert dataservices[0].title == "Explore API v2" + assert dataservices[0].base_api_url == "https://data.paris2024.org/api/explore/v2.1/" + assert dataservices[0].endpoint_description_url == "https://data.paris2024.org/api/explore/v2.1/swagger.json" + def test_harvest_literal_spatial(self, rmock): url = mock_dcat(rmock, 'evian.json') org = OrganizationFactory() From d536874372fb767cbf3c9b63d61a1d9223863380 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Mon, 29 Apr 2024 16:55:06 +0200 Subject: [PATCH 25/60] Add more data inside dataservices --- udata/core/dataservices/models.py | 15 +++++++ udata/core/dataservices/rdf.py | 42 ++++++++++++++---- udata/core/dataset/rdf.py | 70 ++--------------------------- udata/rdf.py | 73 +++++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 76 deletions(-) diff --git a/udata/core/dataservices/models.py b/udata/core/dataservices/models.py index 74e7c97b33..918acfbf1b 100644 --- a/udata/core/dataservices/models.py +++ b/udata/core/dataservices/models.py @@ -31,6 +31,19 @@ def hidden(self): db.Q(deleted_at__ne=None) | db.Q(archived_at__ne=None)) +class HarvestMetadata(db.DynamicEmbeddedDocument): + backend = db.StringField() + + source_id = db.StringField() + source_url = db.StringField() + + dct_identifier = db.StringField() + + remote_id = db.StringField() + remote_url = db.URLField() + + 
last_harvested_at = db.DateTimeField() + @generate_fields() class Dataservice(WithMetrics, Owned, db.Document): meta = { @@ -116,6 +129,8 @@ class Dataservice(WithMetrics, Owned, db.Document): ) ) + harvest = db.EmbeddedDocumentField(HarvestMetadata) + @function_field(description="Link to the API endpoint for this dataservice") def self_api_url(self): return endpoint_for('api.dataservice', dataservice=self, _external=True) diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py index 5a30c107b3..74fb4620c9 100644 --- a/udata/core/dataservices/rdf.py +++ b/udata/core/dataservices/rdf.py @@ -1,16 +1,16 @@ -from pprint import pprint +from datetime import datetime from typing import Optional from rdflib import RDF, Graph -import rdflib -import lxml.etree as ET -from udata.core.dataservices.models import Dataservice -from udata.core.dataset.rdf import rdf_value, remote_url_from_rdf -from udata.rdf import DCAT, DCT, url_from_rdf +from udata.core.dataservices.models import Dataservice, HarvestMetadata +from udata.core.dataset.models import License +from udata.core.dataset.rdf import sanitize_html +from udata.harvest.models import HarvestSource +from udata.rdf import DCAT, DCT, contact_point_from_rdf, rdf_value, theme_labels_from_rdf, themes_from_rdf, url_from_rdf -def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None, node=None): +def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None, node=None, source: Optional[HarvestSource] = None): ''' Create or update a dataset from a RDF/DCAT graph ''' @@ -22,7 +22,31 @@ def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None d = graph.resource(node) dataservice.title = rdf_value(d, DCT.title) - dataservice.base_api_url = rdf_value(d, DCAT.endpointURL) - dataservice.endpoint_description_url = rdf_value(d, DCAT.endpointDescription) + dataservice.description = sanitize_html(d.value(DCT.description) or d.value(DCT.abstract)) + + dataservice.base_api_url = url_from_rdf(d, DCAT.endpointURL) + dataservice.endpoint_description_url = url_from_rdf(d, DCAT.endpointDescription) + + dataservice.contact_point = contact_point_from_rdf(d, dataservice) or dataservice.contact_point + + license = rdf_value(d, DCT.license) + if license is not None: + dataservice.license = License.guess(license) + + dataservice.created_at = rdf_value(d, DCT.issued) + dataservice.metadata_modified_at = rdf_value(d, DCT.modified) + + dataservice.tags = themes_from_rdf(d) + + if not dataservice.harvest: + dataservice.harvest = HarvestMetadata() + + if source is not None: + dataservice.harvest.backend = source.backend + + dataservice.harvest.source_id = source.id + dataservice.harvest.source_url = source.url + + dataservice.harvest.last_harvested_at = datetime.utcnow() return dataservice \ No newline at end of file diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index d8f8c59850..e80c7a03f2 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -18,12 +18,11 @@ from udata import i18n, uris from udata.core.spatial.models import SpatialCoverage -from udata.frontend.markdown import parse_html from udata.core.dataset.models import HarvestDatasetMetadata, HarvestResourceMetadata from udata.models import db, ContactPoint from udata.rdf import ( - DCAT, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS, - namespace_manager, schema_from_rdf, url_from_rdf + DCAT, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS, 
contact_point_from_rdf, + namespace_manager, rdf_value, sanitize_html, schema_from_rdf, theme_labels_from_rdf, themes_from_rdf, url_from_rdf ) from udata.utils import get_by, safe_unicode from udata.uris import endpoint_for @@ -87,32 +86,6 @@ } -class HTMLDetector(HTMLParser): - def __init__(self, *args, **kwargs): - HTMLParser.__init__(self, *args, **kwargs) - self.elements = set() - - def handle_starttag(self, tag, attrs): - self.elements.add(tag) - - def handle_endtag(self, tag): - self.elements.add(tag) - - -def is_html(text): - parser = HTMLDetector() - parser.feed(text) - return bool(parser.elements) - - -def sanitize_html(text): - text = text.toPython() if isinstance(text, Literal) else '' - if is_html(text): - return parse_html(text) - else: - return text.strip() - - def temporal_to_rdf(daterange, graph=None): if not daterange: return @@ -241,18 +214,6 @@ def dataset_to_rdf(dataset, graph=None): } -def serialize_value(value): - if isinstance(value, (URIRef, Literal)): - return value.toPython() - elif isinstance(value, RdfResource): - return value.identifier.toPython() - - -def rdf_value(obj, predicate, default=None): - value = obj.value(predicate) - return serialize_value(value) if value else default - - def temporal_from_literal(text): ''' Parse a temporal coverage from a literal ie. either: @@ -327,29 +288,6 @@ def temporal_from_rdf(period_of_time): # so we log the error for future investigation and improvement log.warning('Unable to parse temporal coverage', exc_info=True) - -def contact_point_from_rdf(rdf, dataset): - contact_point = rdf.value(DCAT.contactPoint) - if contact_point: - name = rdf_value(contact_point, VCARD.fn) or '' - email = (rdf_value(contact_point, VCARD.hasEmail) - or rdf_value(contact_point, VCARD.email) - or rdf_value(contact_point, DCAT.email)) - if not email: - return - email = email.replace('mailto:', '').strip() - if dataset.organization: - contact_point = ContactPoint.objects( - name=name, email=email, organization=dataset.organization).first() - return (contact_point or - ContactPoint(name=name, email=email, organization=dataset.organization).save()) - elif dataset.owner: - contact_point = ContactPoint.objects( - name=name, email=email, owner=dataset.owner).first() - return (contact_point or - ContactPoint(name=name, email=email, owner=dataset.owner).save()) - - def spatial_from_rdf(graph): geojsons = [] for term in graph.objects(DCT.spatial): @@ -592,9 +530,7 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None): if acronym: dataset.acronym = acronym - tags = [tag.toPython() for tag in d.objects(DCAT.keyword)] - tags += theme_labels_from_rdf(d) - dataset.tags = list(set(tags)) + dataset.tags = themes_from_rdf(d) temporal_coverage = temporal_from_rdf(d.value(DCT.temporal)) if temporal_coverage: diff --git a/udata/rdf.py b/udata/rdf.py index 1203b1bb92..57d0de9077 100644 --- a/udata/rdf.py +++ b/udata/rdf.py @@ -1,6 +1,7 @@ ''' This module centralize udata-wide RDF helpers and configuration ''' +from html.parser import HTMLParser import logging import re @@ -13,8 +14,10 @@ ) from rdflib.util import SUFFIX_FORMAT_MAP, guess_format as raw_guess_format from udata import uris +from udata.core.contact_point.models import ContactPoint from udata.models import Schema from udata.mongo.errors import FieldValidationError +from udata.frontend.markdown import parse_html log = logging.getLogger(__name__) @@ -212,6 +215,42 @@ def want_rdf(): 'totalItems': 'hydra:totalItems', } +def serialize_value(value): + if isinstance(value, (URIRef, Literal)): + return 
value.toPython() + elif isinstance(value, RdfResource): + return value.identifier.toPython() + + +def rdf_value(obj, predicate, default=None): + value = obj.value(predicate) + return serialize_value(value) if value else default + +class HTMLDetector(HTMLParser): + def __init__(self, *args, **kwargs): + HTMLParser.__init__(self, *args, **kwargs) + self.elements = set() + + def handle_starttag(self, tag, attrs): + self.elements.add(tag) + + def handle_endtag(self, tag): + self.elements.add(tag) + + +def is_html(text): + parser = HTMLDetector() + parser.feed(text) + return bool(parser.elements) + + +def sanitize_html(text): + text = text.toPython() if isinstance(text, Literal) else '' + if is_html(text): + return parse_html(text) + else: + return text.strip() + def url_from_rdf(rdf, prop): ''' @@ -224,6 +263,40 @@ def url_from_rdf(rdf, prop): elif isinstance(value, RdfResource): return value.identifier.toPython() +def theme_labels_from_rdf(rdf): + for theme in rdf.objects(DCAT.theme): + if isinstance(theme, RdfResource): + label = rdf_value(theme, SKOS.prefLabel) + else: + label = theme.toPython() + if label: + yield label + +def themes_from_rdf(rdf): + tags = [tag.toPython() for tag in rdf.objects(DCAT.keyword)] + tags += theme_labels_from_rdf(rdf) + return list(set(tags)) + +def contact_point_from_rdf(rdf, dataset): + contact_point = rdf.value(DCAT.contactPoint) + if contact_point: + name = rdf_value(contact_point, VCARD.fn) or '' + email = (rdf_value(contact_point, VCARD.hasEmail) + or rdf_value(contact_point, VCARD.email) + or rdf_value(contact_point, DCAT.email)) + if not email: + return + email = email.replace('mailto:', '').strip() + if dataset.organization: + contact_point = ContactPoint.objects( + name=name, email=email, organization=dataset.organization).first() + return (contact_point or + ContactPoint(name=name, email=email, organization=dataset.organization).save()) + elif dataset.owner: + contact_point = ContactPoint.objects( + name=name, email=email, owner=dataset.owner).first() + return (contact_point or + ContactPoint(name=name, email=email, owner=dataset.owner).save()) def schema_from_rdf(rdf): ''' From ce30a60227d024df964530b1a41319879acaed9d Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Thu, 16 May 2024 14:42:12 +0200 Subject: [PATCH 26/60] Revert hvd labels from rebase --- udata/core/dataset/rdf.py | 23 +---------------------- udata/rdf.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index e80c7a03f2..e9c53961bd 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -22,7 +22,7 @@ from udata.models import db, ContactPoint from udata.rdf import ( DCAT, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS, contact_point_from_rdf, - namespace_manager, rdf_value, sanitize_html, schema_from_rdf, theme_labels_from_rdf, themes_from_rdf, url_from_rdf + namespace_manager, rdf_value, sanitize_html, schema_from_rdf, themes_from_rdf, url_from_rdf ) from udata.utils import get_by, safe_unicode from udata.uris import endpoint_for @@ -415,27 +415,6 @@ def remote_url_from_rdf(rdf): except uris.ValidationError: pass - -def theme_labels_from_rdf(rdf): - ''' - Get theme labels to use as keywords. - Map HVD keywords from known URIs resources if HVD support is activated. 
- ''' - for theme in rdf.objects(DCAT.theme): - if isinstance(theme, RdfResource): - uri = theme.identifier.toPython() - if current_app.config['HVD_SUPPORT'] and uri in EU_HVD_CATEGORIES: - label = EU_HVD_CATEGORIES[uri] - # Additionnally yield hvd keyword - yield 'hvd' - else: - label = rdf_value(theme, SKOS.prefLabel) - else: - label = theme.toPython() - if label: - yield label - - def resource_from_rdf(graph_or_distrib, dataset=None, is_additionnal=False): ''' Map a Resource domain model to a DCAT/RDF graph diff --git a/udata/rdf.py b/udata/rdf.py index 57d0de9077..d0096115f4 100644 --- a/udata/rdf.py +++ b/udata/rdf.py @@ -264,9 +264,19 @@ def url_from_rdf(rdf, prop): return value.identifier.toPython() def theme_labels_from_rdf(rdf): + ''' + Get theme labels to use as keywords. + Map HVD keywords from known URIs resources if HVD support is activated. + ''' for theme in rdf.objects(DCAT.theme): if isinstance(theme, RdfResource): - label = rdf_value(theme, SKOS.prefLabel) + uri = theme.identifier.toPython() + if current_app.config['HVD_SUPPORT'] and uri in EU_HVD_CATEGORIES: + label = EU_HVD_CATEGORIES[uri] + # Additionnally yield hvd keyword + yield 'hvd' + else: + label = rdf_value(theme, SKOS.prefLabel) else: label = theme.toPython() if label: From 3e4d901775a6709a5057abde6fd071bf4a805d7f Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Thu, 16 May 2024 15:01:28 +0200 Subject: [PATCH 27/60] Fix tests --- udata/core/dataset/rdf.py | 11 ---- udata/harvest/backends/base.py | 96 +++++++++++++++++++++++++++++++++- udata/harvest/backends/dcat.py | 25 ++++++--- udata/harvest/models.py | 2 + udata/rdf.py | 11 +++- 5 files changed, 126 insertions(+), 19 deletions(-) diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index e9c53961bd..3bb900dc62 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -75,17 +75,6 @@ EUFREQ.NEVER: 'punctual', } -# Map High Value Datasets URIs to keyword categories -EU_HVD_CATEGORIES = { - "http://data.europa.eu/bna/c_164e0bf5": "Météorologiques", - "http://data.europa.eu/bna/c_a9135398": "Entreprises et propriété d'entreprises", - "http://data.europa.eu/bna/c_ac64a52d": "Géospatiales", - "http://data.europa.eu/bna/c_b79e35eb": "Mobilité", - "http://data.europa.eu/bna/c_dd313021": "Observation de la terre et environnement", - "http://data.europa.eu/bna/c_e1da4e07": "Statistiques" -} - - def temporal_to_rdf(daterange, graph=None): if not daterange: return diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 60eb5447a7..f57f792d90 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -8,6 +8,7 @@ import requests from flask import current_app +from udata.core.dataservices.models import Dataservice from voluptuous import MultipleInvalid, RequiredFieldInvalid from udata.core.dataset.models import HarvestDatasetMetadata @@ -153,6 +154,9 @@ def inner_harvest(self): def inner_process_dataset(self, item: HarvestItem) -> Dataset: raise NotImplementedError + def inner_process_dataservice(self, item: HarvestItem) -> Dataservice: + raise NotImplementedError + def harvest(self): log.debug(f'Starting harvesting {self.source.name} ({self.source.url})…') factory = HarvestJob if self.dryrun else HarvestJob.objects.create @@ -238,6 +242,52 @@ def is_done(self) -> bool: '''Should be called after process_dataset to know if we reach the max items''' return self.max_items and len(self.job.items) >= self.max_items + def process_dataservice(self, remote_id: str, **kwargs) -> bool : + ''' + 
Return `True` if the parent should stop iterating because we exceed the number + of items to process. + ''' + log.debug(f'Processing dataservice {remote_id}…') + + # TODO add `type` to `HarvestItem` to differentiate `Dataset` from `Dataservice` + item = HarvestItem(status='started', started=datetime.utcnow(), remote_id=remote_id) + self.job.items.append(item) + self.save_job() + + try: + dataservice = self.inner_process_dataservice(item, **kwargs) + + dataservice.harvest = self.update_harvest_info(dataservice.harvest, remote_id) + dataservice.archived = None + + # TODO: Apply editable mappings + + if self.dryrun: + dataservice.validate() + else: + dataservice.save() + item.dataservice = dataservice + item.status = 'done' + except HarvestSkipException as e: + item.status = 'skipped' + + log.info(f'Skipped item {item.remote_id} : {safe_unicode(e)}') + item.errors.append(HarvestError(message=safe_unicode(e))) + except HarvestValidationError as e: + item.status = 'failed' + + log.info(f'Error validating item {item.remote_id} : {safe_unicode(e)}') + item.errors.append(HarvestError(message=safe_unicode(e))) + except Exception as e: + item.status = 'failed' + log.exception(f'Error while processing {item.remote_id} : {safe_unicode(e)}') + + error = HarvestError(message=safe_unicode(e), details=traceback.format_exc()) + item.errors.append(error) + finally: + item.ended = datetime.utcnow() + self.save_job() + def update_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int): if not harvest: harvest = HarvestDatasetMetadata() @@ -313,6 +363,50 @@ def get_dataset(self, remote_id): return Dataset(owner=self.source.owner) return Dataset() + + def get_dataservice(self, remote_id): + '''Get or create a dataservice given its remote ID (and its source) + We first try to match `source_id` to be source domain independent + ''' + dataservice = Dataservice.objects(__raw__={ + 'harvest.remote_id': remote_id, + '$or': [ + {'harvest.domain': self.source.domain}, + {'harvest.source_id': str(self.source.id)}, + ], + }).first() + + if dataservice: + return dataservice + + if self.source.organization: + return Dataservice(organization=self.source.organization) + elif self.source.owner: + return Dataservice(owner=self.source.owner) + + return Dataservice() + + def get_dataservice(self, remote_id): + '''Get or create a dataservice given its remote ID (and its source) + We first try to match `source_id` to be source domain independent + ''' + dataservice = Dataservice.objects(__raw__={ + 'harvest.remote_id': remote_id, + '$or': [ + {'harvest.domain': self.source.domain}, + {'harvest.source_id': str(self.source.id)}, + ], + }).first() + + if dataservice: + return dataservice + + if self.source.organization: + return Dataservice(organization=self.source.organization) + elif self.source.owner: + return Dataservice(owner=self.source.owner) + + return Dataservice() def validate(self, data, schema): '''Perform a data validation against a given schema. 
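The dataservice path deliberately mirrors `process_dataset`: every remote record becomes a `HarvestItem`, and exceptions are folded into that item's status (`done`, `skipped` or `failed`) instead of aborting the whole job. A condensed, runnable sketch of this error-containment pattern, using simplified stand-ins rather than the udata models:

    import traceback
    from datetime import datetime

    class HarvestSkipException(Exception):
        '''Simplified stand-in for udata.harvest.exceptions.HarvestSkipException.'''

    class Item:
        '''Bare-bones stand-in for HarvestItem.'''
        def __init__(self, remote_id):
            self.remote_id = remote_id
            self.status = 'started'
            self.errors = []
            self.ended = None

    def process_one(item, inner):
        '''Run `inner` for one record, folding any exception into the item state.'''
        try:
            inner(item)
            item.status = 'done'
        except HarvestSkipException as e:
            item.status = 'skipped'
            item.errors.append(str(e))
        except Exception as e:
            item.status = 'failed'
            item.errors.append(f'{e}\n{traceback.format_exc()}')
        finally:
            item.ended = datetime.utcnow()  # the real backend also saves the job here

    def skip(item):
        raise HarvestSkipException('missing title')

    item = Item('remote-1')
    process_one(item, skip)
    assert item.status == 'skipped' and item.ended is not None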
@@ -353,4 +447,4 @@ def validate(self, data, schema): msg = str(error) errors.append(msg) msg = '\n- '.join(['Validation error:'] + errors) - raise HarvestValidationError(msg) \ No newline at end of file + raise HarvestValidationError(msg) diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index 573c028cc1..b8a5afd069 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -1,19 +1,17 @@ import logging -from rdflib import Graph, URIRef +from rdflib import Graph from rdflib.namespace import RDF import lxml.etree as ET -import boto3 from flask import current_app from datetime import date -import json -from typing import Generator, List +from typing import Generator -from udata.core.dataset.models import Dataset from udata.rdf import ( DCAT, DCT, HYDRA, SPDX, namespace_manager, guess_format, url_from_rdf ) from udata.core.dataset.rdf import dataset_from_rdf +from udata.core.dataservices.rdf import dataservice_from_rdf from udata.storage.s3 import store_as_json, get_from_json from udata.harvest.models import HarvestItem @@ -71,7 +69,8 @@ def inner_harvest(self): self.process_one_datasets_page(page_number, page) serialized_graphs.append(page.serialize(format=fmt, indent=None)) - # TODO call `walk_graph` with `process_dataservices` + for page_number, page in self.walk_graph(self.source.url, fmt): + self.process_one_dataservices_page(page_number, page) # The official MongoDB document size in 16MB. The default value here is 15MB to account for other fields in the document (and for difference between * 1024 vs * 1000). max_harvest_graph_size_in_mongo = current_app.config.get('HARVEST_MAX_CATALOG_SIZE_IN_MONGO') @@ -152,6 +151,14 @@ def process_one_datasets_page(self, page_number: int, page: Graph): if self.is_done(): return + + def process_one_dataservices_page(self, page_number: int, page: Graph): + for node in page.subjects(RDF.type, DCAT.DataService): + remote_id = page.value(node, DCT.identifier) + self.process_dataservice(remote_id, page_number=page_number, page=page, node=node) + + if self.is_done(): + return def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph, node): item.kwargs['page_number'] = page_number @@ -159,6 +166,12 @@ def inner_process_dataset(self, item: HarvestItem, page_number: int, page: Graph dataset = self.get_dataset(item.remote_id) return dataset_from_rdf(page, dataset, node=node) + def inner_process_dataservice(self, item: HarvestItem, page_number: int, page: Graph, node): + item.kwargs['page_number'] = page_number + + dataservice = self.get_dataservice(item.remote_id) + return dataservice_from_rdf(page, dataservice, node=node) + def get_node_from_item(self, graph, item): for node in graph.subjects(RDF.type, DCAT.Dataset): if str(graph.value(node, DCT.identifier)) == item.remote_id: diff --git a/udata/harvest/models.py b/udata/harvest/models.py index 032f5f09de..e1676e4b92 100644 --- a/udata/harvest/models.py +++ b/udata/harvest/models.py @@ -3,6 +3,7 @@ import logging from urllib.parse import urlparse +from udata.core.dataservices.models import Dataservice from werkzeug.utils import cached_property from udata.core.dataset.models import HarvestDatasetMetadata @@ -53,6 +54,7 @@ class HarvestError(db.EmbeddedDocument): class HarvestItem(db.EmbeddedDocument): remote_id = db.StringField() dataset = db.ReferenceField(Dataset) + dataservice = db.ReferenceField(Dataservice) status = db.StringField(choices=list(HARVEST_ITEM_STATUS), default=DEFAULT_HARVEST_ITEM_STATUS, required=True) created = 
db.DateTimeField(default=datetime.utcnow, required=True) diff --git a/udata/rdf.py b/udata/rdf.py index d0096115f4..9fa38f87e8 100644 --- a/udata/rdf.py +++ b/udata/rdf.py @@ -5,7 +5,7 @@ import logging import re -from flask import request, url_for, abort +from flask import request, url_for, abort, current_app from rdflib import Graph, Literal, URIRef from rdflib.resource import Resource as RdfResource @@ -101,6 +101,15 @@ # Includes control characters, unicode surrogate characters and unicode end-of-plane non-characters ILLEGAL_XML_CHARS = '[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]' +# Map High Value Datasets URIs to keyword categories +EU_HVD_CATEGORIES = { + "http://data.europa.eu/bna/c_164e0bf5": "Météorologiques", + "http://data.europa.eu/bna/c_a9135398": "Entreprises et propriété d'entreprises", + "http://data.europa.eu/bna/c_ac64a52d": "Géospatiales", + "http://data.europa.eu/bna/c_b79e35eb": "Mobilité", + "http://data.europa.eu/bna/c_dd313021": "Observation de la terre et environnement", + "http://data.europa.eu/bna/c_e1da4e07": "Statistiques" +} def guess_format(string): '''Guess format given an extension or a mime-type''' From ae02115048acf112163f234ed3b6beacf6263d80 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Mon, 27 May 2024 16:06:52 +0200 Subject: [PATCH 28/60] Remove duplicate code --- udata/harvest/backends/base.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index f57f792d90..fb94cf8afe 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -364,28 +364,6 @@ def get_dataset(self, remote_id): return Dataset() - def get_dataservice(self, remote_id): - '''Get or create a dataservice given its remote ID (and its source) - We first try to match `source_id` to be source domain independent - ''' - dataservice = Dataservice.objects(__raw__={ - 'harvest.remote_id': remote_id, - '$or': [ - {'harvest.domain': self.source.domain}, - {'harvest.source_id': str(self.source.id)}, - ], - }).first() - - if dataservice: - return dataservice - - if self.source.organization: - return Dataservice(organization=self.source.organization) - elif self.source.owner: - return Dataservice(owner=self.source.owner) - - return Dataservice() - def get_dataservice(self, remote_id): '''Get or create a dataservice given its remote ID (and its source) We first try to match `source_id` to be source domain independent From 345f05b7080974991d8a829438b1a39c1a5fd510 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 29 May 2024 16:43:48 +0200 Subject: [PATCH 29/60] wip add printing graphs --- udata/core/dataservices/rdf.py | 15 +++++++++++++- udata/core/dataset/rdf.py | 7 ++++++- udata/harvest/backends/dcat.py | 4 ++++ udata/rdf.py | 38 ++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 2 deletions(-) diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py index 74fb4620c9..abc755f800 100644 --- a/udata/core/dataservices/rdf.py +++ b/udata/core/dataservices/rdf.py @@ -7,7 +7,7 @@ from udata.core.dataset.models import License from udata.core.dataset.rdf import sanitize_html from udata.harvest.models import HarvestSource -from udata.rdf import DCAT, DCT, contact_point_from_rdf, rdf_value, theme_labels_from_rdf, themes_from_rdf, url_from_rdf +from udata.rdf import DCAT, DCT, contact_point_from_rdf, print_graph, rdf_value, theme_labels_from_rdf, themes_from_rdf, url_from_rdf def dataservice_from_rdf(graph: Graph, dataservice: 
Optional[Dataservice] = None, node=None, source: Optional[HarvestSource] = None): @@ -21,6 +21,8 @@ def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None d = graph.resource(node) + print_graph(graph, d) + dataservice.title = rdf_value(d, DCT.title) dataservice.description = sanitize_html(d.value(DCT.description) or d.value(DCT.abstract)) @@ -29,6 +31,17 @@ def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None dataservice.contact_point = contact_point_from_rdf(d, dataservice) or dataservice.contact_point + print('----') + print('----') + print('----') + for test in d.objects(DCAT.servesDataset): + print(test.value(DCT.identifier)) + # for x, y, z in test: + # print(x, y, z) + print('----') + print('----') + print('----') + license = rdf_value(d, DCT.license) if license is not None: dataservice.license = License.guess(license) diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index 3bb900dc62..11b473147d 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -22,7 +22,7 @@ from udata.models import db, ContactPoint from udata.rdf import ( DCAT, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS, contact_point_from_rdf, - namespace_manager, rdf_value, sanitize_html, schema_from_rdf, themes_from_rdf, url_from_rdf + namespace_manager, print_graph, rdf_value, sanitize_html, schema_from_rdf, themes_from_rdf, url_from_rdf ) from udata.utils import get_by, safe_unicode from udata.uris import endpoint_for @@ -480,7 +480,12 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None): d = graph.resource(node) + print_graph(graph, d) + dataset.title = rdf_value(d, DCT.title) + if not dataset.title: + return None + # Support dct:abstract if dct:description is missing (sometimes used instead) description = d.value(DCT.description) or d.value(DCT.abstract) dataset.description = sanitize_html(description) diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index b8a5afd069..81528fdc19 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -315,6 +315,10 @@ def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, N dc:type dataset + + dc:type + service + dc:type series diff --git a/udata/rdf.py b/udata/rdf.py index 9fa38f87e8..21659ac69a 100644 --- a/udata/rdf.py +++ b/udata/rdf.py @@ -415,3 +415,41 @@ def graph_response(graph, format): if isinstance(graph, RdfResource): graph = graph.graph return escape_xml_illegal_chars(graph.serialize(format=fmt, **kwargs)), 200, headers + +def print_graph(graph: Graph, node: Graph): + for x, y, z in node: + print_node(graph, y, z) + +def print_node(graph: Graph, predicate, objects, level = 0, printed_ids = None): + if printed_ids is None: + printed_ids = set() + + nm = graph.namespace_manager + indent = level * 4 * ' ' + inner_indent = (level + 1) * 4 * ' ' + + print(f"{indent}<{predicate.identifier.n3(nm)}>") + if isinstance(objects, Literal): + if objects.toPython(): + print(f"{inner_indent}{objects.toPython()}", end='') + else: + print(f"**empty literal**") + if (objects.language): + print(f" (lang={objects.language})", end='') + if (objects.datatype): + print(f" (datatype={objects.datatype})", end='') + print('') + elif isinstance(objects, RdfResource): + print(f"{inner_indent}Resource ID: {objects.identifier.toPython()}") + if objects.identifier.toPython() not in printed_ids: + printed_ids.add(objects.identifier.toPython()) + for x, y, z in objects: + print_node(graph, y, 
z, level + 1, printed_ids=printed_ids) + else: + print(f"{inner_indent}Skipping resource because already printed.") + else: + print(objects.__class__) + for x, y, z in objects: + print_node(graph, y, z, level + 1, printed_ids=printed_ids) + print(f"{indent}") + From 735f6d5620be9b238526f45ba96101eecc4a8b44 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 29 May 2024 16:45:17 +0200 Subject: [PATCH 30/60] Remove duplicate harvest update info --- udata/core/dataservices/rdf.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py index abc755f800..d73bd5664a 100644 --- a/udata/core/dataservices/rdf.py +++ b/udata/core/dataservices/rdf.py @@ -51,15 +51,4 @@ def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None dataservice.tags = themes_from_rdf(d) - if not dataservice.harvest: - dataservice.harvest = HarvestMetadata() - - if source is not None: - dataservice.harvest.backend = source.backend - - dataservice.harvest.source_id = source.id - dataservice.harvest.source_url = source.url - - dataservice.harvest.last_harvested_at = datetime.utcnow() - return dataservice \ No newline at end of file From b83d6974dfae9505e069a3fbb3a456d63287fff3 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Thu, 30 May 2024 08:59:21 +0200 Subject: [PATCH 31/60] Add link between dataservices and datasets --- udata/core/dataservices/rdf.py | 33 ++++++++++++++++----------------- udata/core/dataset/rdf.py | 7 +++---- udata/harvest/backends/dcat.py | 2 +- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py index d73bd5664a..8e16aa9d6c 100644 --- a/udata/core/dataservices/rdf.py +++ b/udata/core/dataservices/rdf.py @@ -1,28 +1,24 @@ from datetime import datetime -from typing import Optional +from typing import List, Optional from rdflib import RDF, Graph from udata.core.dataservices.models import Dataservice, HarvestMetadata -from udata.core.dataset.models import License +from udata.core.dataset.models import Dataset, License from udata.core.dataset.rdf import sanitize_html from udata.harvest.models import HarvestSource from udata.rdf import DCAT, DCT, contact_point_from_rdf, print_graph, rdf_value, theme_labels_from_rdf, themes_from_rdf, url_from_rdf -def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None, node=None, source: Optional[HarvestSource] = None): +def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, datasets: List[Dataset]) -> Dataservice : ''' Create or update a dataset from a RDF/DCAT graph ''' - dataservice = dataservice or Dataservice() - if node is None: # Assume first match is the only match node = graph.value(predicate=RDF.type, object=DCAT.DataService) d = graph.resource(node) - print_graph(graph, d) - dataservice.title = rdf_value(d, DCT.title) dataservice.description = sanitize_html(d.value(DCT.description) or d.value(DCT.abstract)) @@ -31,16 +27,19 @@ def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None dataservice.contact_point = contact_point_from_rdf(d, dataservice) or dataservice.contact_point - print('----') - print('----') - print('----') - for test in d.objects(DCAT.servesDataset): - print(test.value(DCT.identifier)) - # for x, y, z in test: - # print(x, y, z) - print('----') - print('----') - print('----') + for dataset_node in d.objects(DCAT.servesDataset): + id = dataset_node.value(DCT.identifier) + dataset = next((d for d in datasets if d 
is not None and d.harvest.remote_id == id.toPython()), None) + print('---') + print(id) + print_graph(graph, dataset_node) + print('---') + for d in datasets: + if d is not None: + print(d.harvest.remote_id) + if dataset is not None: + print('found dataset!') + dataservice.datasets.append(dataset.id) license = rdf_value(d, DCT.license) if license is not None: diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index 11b473147d..be8fac72fb 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -19,10 +19,11 @@ from udata import i18n, uris from udata.core.spatial.models import SpatialCoverage from udata.core.dataset.models import HarvestDatasetMetadata, HarvestResourceMetadata +from udata.harvest.exceptions import HarvestSkipException from udata.models import db, ContactPoint from udata.rdf import ( DCAT, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS, contact_point_from_rdf, - namespace_manager, print_graph, rdf_value, sanitize_html, schema_from_rdf, themes_from_rdf, url_from_rdf + namespace_manager, rdf_value, sanitize_html, schema_from_rdf, themes_from_rdf, url_from_rdf ) from udata.utils import get_by, safe_unicode from udata.uris import endpoint_for @@ -480,11 +481,9 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None): d = graph.resource(node) - print_graph(graph, d) - dataset.title = rdf_value(d, DCT.title) if not dataset.title: - return None + raise HarvestSkipException # Support dct:abstract if dct:description is missing (sometimes used instead) description = d.value(DCT.description) or d.value(DCT.abstract) diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index 81528fdc19..028c22d6ee 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -170,7 +170,7 @@ def inner_process_dataservice(self, item: HarvestItem, page_number: int, page: G item.kwargs['page_number'] = page_number dataservice = self.get_dataservice(item.remote_id) - return dataservice_from_rdf(page, dataservice, node=node) + return dataservice_from_rdf(page, dataservice, node, [item.dataset for item in self.job.items]) def get_node_from_item(self, graph, item): for node in graph.subjects(RDF.type, DCAT.Dataset): From 29cede640f137d119b83d5cceafc3a5f96b499dc Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Thu, 30 May 2024 16:11:16 +0200 Subject: [PATCH 32/60] link dataset with dataservices --- udata/core/dataservices/rdf.py | 17 +++++++-------- udata/harvest/backends/base.py | 9 ++++---- udata/harvest/backends/dcat.py | 5 +++++ udata/rdf.py | 38 ---------------------------------- 4 files changed, 17 insertions(+), 52 deletions(-) diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py index 8e16aa9d6c..7e9780573e 100644 --- a/udata/core/dataservices/rdf.py +++ b/udata/core/dataservices/rdf.py @@ -7,7 +7,7 @@ from udata.core.dataset.models import Dataset, License from udata.core.dataset.rdf import sanitize_html from udata.harvest.models import HarvestSource -from udata.rdf import DCAT, DCT, contact_point_from_rdf, print_graph, rdf_value, theme_labels_from_rdf, themes_from_rdf, url_from_rdf +from udata.rdf import DCAT, DCT, contact_point_from_rdf, rdf_value, theme_labels_from_rdf, themes_from_rdf, url_from_rdf def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, datasets: List[Dataset]) -> Dataservice : @@ -29,16 +29,13 @@ def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, datasets: for dataset_node in d.objects(DCAT.servesDataset): 
id = dataset_node.value(DCT.identifier) - dataset = next((d for d in datasets if d is not None and d.harvest.remote_id == id.toPython()), None) - print('---') - print(id) - print_graph(graph, dataset_node) - print('---') - for d in datasets: - if d is not None: - print(d.harvest.remote_id) + dataset = next((d for d in datasets if d is not None and d.harvest.remote_id == id), None) + + if dataset is None: + # We try with `endswith` because Europe XSLT have problems with IDs. Sometimes they are prefixed with the domain of the catalog, sometimes not. + dataset = next((d for d in datasets if d is not None and d.harvest.remote_id.endswith(id)), None) + if dataset is not None: - print('found dataset!') dataservice.datasets.append(dataset.id) license = rdf_value(d, DCT.license) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index fb94cf8afe..47ed494cf2 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -12,6 +12,7 @@ from voluptuous import MultipleInvalid, RequiredFieldInvalid from udata.core.dataset.models import HarvestDatasetMetadata +from udata.core.dataservices.models import HarvestMetadata as HarvestDataserviceMetadata from udata.models import Dataset from udata.utils import safe_unicode @@ -207,7 +208,7 @@ def process_dataset(self, remote_id: str, **kwargs): dataset = self.inner_process_dataset(item, **kwargs) # Use `item.remote_id` because `inner_process_dataset` could have modified it. - dataset.harvest = self.update_harvest_info(dataset.harvest, item.remote_id) + dataset.harvest = self.update_harvest_info(HarvestDatasetMetadata, dataset.harvest, item.remote_id) dataset.archived = None # TODO: Apply editable mappings @@ -257,7 +258,7 @@ def process_dataservice(self, remote_id: str, **kwargs) -> bool : try: dataservice = self.inner_process_dataservice(item, **kwargs) - dataservice.harvest = self.update_harvest_info(dataservice.harvest, remote_id) + dataservice.harvest = self.update_harvest_info(HarvestDataserviceMetadata, dataservice.harvest, remote_id) dataservice.archived = None # TODO: Apply editable mappings @@ -288,9 +289,9 @@ def process_dataservice(self, remote_id: str, **kwargs) -> bool : item.ended = datetime.utcnow() self.save_job() - def update_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int): + def update_harvest_info(self, default, harvest: Optional[HarvestDatasetMetadata], remote_id: int): if not harvest: - harvest = HarvestDatasetMetadata() + harvest = default() harvest.domain = self.source.domain harvest.remote_id = remote_id harvest.source_id = str(self.source.id) diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index 028c22d6ee..033809c080 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -155,6 +155,10 @@ def process_one_datasets_page(self, page_number: int, page: Graph): def process_one_dataservices_page(self, page_number: int, page: Graph): for node in page.subjects(RDF.type, DCAT.DataService): remote_id = page.value(node, DCT.identifier) + if not remote_id: + log.warning(f"Skipping dataservice because no `remote_id`") + continue + self.process_dataservice(remote_id, page_number=page_number, page=page, node=node) if self.is_done(): @@ -337,6 +341,7 @@ def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, N headers=headers) response.raise_for_status() + tree_before_transform = ET.fromstring(response.content) # Disabling CoupledResourceLookUp to prevent failure on xlink:href # 
https://github.com/SEMICeu/iso-19139-to-dcat-ap/blob/master/documentation/HowTo.md#parameter-coupledresourcelookup diff --git a/udata/rdf.py b/udata/rdf.py index 21659ac69a..9fa38f87e8 100644 --- a/udata/rdf.py +++ b/udata/rdf.py @@ -415,41 +415,3 @@ def graph_response(graph, format): if isinstance(graph, RdfResource): graph = graph.graph return escape_xml_illegal_chars(graph.serialize(format=fmt, **kwargs)), 200, headers - -def print_graph(graph: Graph, node: Graph): - for x, y, z in node: - print_node(graph, y, z) - -def print_node(graph: Graph, predicate, objects, level = 0, printed_ids = None): - if printed_ids is None: - printed_ids = set() - - nm = graph.namespace_manager - indent = level * 4 * ' ' - inner_indent = (level + 1) * 4 * ' ' - - print(f"{indent}<{predicate.identifier.n3(nm)}>") - if isinstance(objects, Literal): - if objects.toPython(): - print(f"{inner_indent}{objects.toPython()}", end='') - else: - print(f"**empty literal**") - if (objects.language): - print(f" (lang={objects.language})", end='') - if (objects.datatype): - print(f" (datatype={objects.datatype})", end='') - print('') - elif isinstance(objects, RdfResource): - print(f"{inner_indent}Resource ID: {objects.identifier.toPython()}") - if objects.identifier.toPython() not in printed_ids: - printed_ids.add(objects.identifier.toPython()) - for x, y, z in objects: - print_node(graph, y, z, level + 1, printed_ids=printed_ids) - else: - print(f"{inner_indent}Skipping resource because already printed.") - else: - print(objects.__class__) - for x, y, z in objects: - print_node(graph, y, z, level + 1, printed_ids=printed_ids) - print(f"{indent}") - From 2c7a24d25fa2cbac9d061a606654475870f1af70 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Mon, 3 Jun 2024 12:46:54 +0200 Subject: [PATCH 33/60] Use forked XSLT --- udata/harvest/backends/dcat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index 033809c080..fa06d3e0c2 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -285,7 +285,7 @@ class CswIso19139DcatBackend(DcatBackend): ISO_SCHEMA = 'http://www.isotc211.org/2005/gmd' - XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl" + XSL_URL = "https://raw.githubusercontent.com/datagouv/iso-19139-to-dcat-ap/empty_function_to_distribution/iso-19139-to-dcat-ap.xsl" def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]: """ From aa125aaf1becd2e3c750dbf824e597e5d535a893 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 4 Jun 2024 13:02:03 +0200 Subject: [PATCH 34/60] fix merge issues --- udata/harvest/backends/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 785f243422..43e9c2cdd6 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -209,8 +209,6 @@ def process_dataset(self, remote_id: str, **kwargs): self.save_job() try: - # Use `item.remote_id` because `inner_process_dataset` could have modified it. 
- dataset.harvest = self.update_harvest_info(HarvestDatasetMetadata, dataset.harvest, item.remote_id) if not remote_id: raise HarvestSkipException("missing identifier") @@ -265,6 +263,9 @@ def process_dataservice(self, remote_id: str, **kwargs) -> bool : self.save_job() try: + if not remote_id: + raise HarvestSkipException("missing identifier") + dataservice = self.inner_process_dataservice(item, **kwargs) dataservice.harvest = self.update_harvest_info(HarvestDataserviceMetadata, dataservice.harvest, remote_id) From d667b7bcc89c322607560f5a631eb203cee97c29 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 4 Jun 2024 13:50:35 +0200 Subject: [PATCH 35/60] Fix tests --- udata/harvest/tests/dcat/bnodes.xml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/udata/harvest/tests/dcat/bnodes.xml b/udata/harvest/tests/dcat/bnodes.xml index 892f6763dd..b32be46470 100644 --- a/udata/harvest/tests/dcat/bnodes.xml +++ b/udata/harvest/tests/dcat/bnodes.xml @@ -13,7 +13,7 @@ > - + Dataset 3 2016-12-14T19:01:24.184120 1.0 @@ -72,7 +72,7 @@ - + Tag 1 2016-12-14T19:01:24.184120 @@ -105,12 +105,8 @@ - - - - - - + + 2016-12-15T09:19:51.723691 From e6816c98eb61ae816337c1a65da791661fc24ea1 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 4 Jun 2024 16:26:43 +0200 Subject: [PATCH 36/60] Improve some skips --- udata/core/dataset/rdf.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index be8fac72fb..0c331dd35c 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -426,6 +426,12 @@ def resource_from_rdf(graph_or_distrib, dataset=None, is_additionnal=False): if not url: log.warning(f'Resource without url: {distrib}') return + + try: + url = uris.validate(url) + except uris.ValidationError: + log.warning(f'Resource with invalid url: {url}') + return if dataset: resource = get_by(dataset.resources, 'url', url) @@ -483,7 +489,7 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None): dataset.title = rdf_value(d, DCT.title) if not dataset.title: - raise HarvestSkipException + raise HarvestSkipException("missing title") # Support dct:abstract if dct:description is missing (sometimes used instead) description = d.value(DCT.description) or d.value(DCT.abstract) From 4eec493ededffdfdbf07d0aeb89ac1dc6d920eca Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 4 Jun 2024 18:09:41 +0200 Subject: [PATCH 37/60] Remove merge tag --- udata/harvest/backends/base.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 43e9c2cdd6..383b97b1b2 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -74,26 +74,6 @@ class BaseBackend(object): """ Base class that wrap children methods to add error management and debug logs. Also provides a few helpers needed on all or some backends. - -<<<<<<< HEAD - The flow is the following: - Parent Child - - harvest -> inner_harvest() - / - process_dataset (create HarvestItem) <------ - \ - --------> inner_process_dataset() (call get_dataset() and update object) - - - process_dataset: - 1. Create HarvestItem - 2. Call inner_process_dataset(item) - 3. Save HarvestItem (dryrun) - 4. 
Save dataset (dryrun) -======= - ->>>>>>> master """ name = None From f5c6917394e0cd0501a0304e0230fee60472a12d Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 5 Jun 2024 10:13:11 +0200 Subject: [PATCH 38/60] Separate metadata --- udata/core/dataservices/models.py | 7 +++--- udata/harvest/backends/base.py | 37 ++++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/udata/core/dataservices/models.py b/udata/core/dataservices/models.py index 7613481d42..8ce723e48a 100644 --- a/udata/core/dataservices/models.py +++ b/udata/core/dataservices/models.py @@ -33,16 +33,15 @@ def hidden(self): class HarvestMetadata(db.DynamicEmbeddedDocument): backend = db.StringField() + domain = db.StringField() source_id = db.StringField() - source_url = db.StringField() + source_url = db.URLField() - dct_identifier = db.StringField() - remote_id = db.StringField() - remote_url = db.URLField() last_harvested_at = db.DateTimeField() + archived_at = db.DateTimeField() @generate_fields() class Dataservice(WithMetrics, Owned, db.Document): diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 383b97b1b2..52930f4457 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -195,7 +195,7 @@ def process_dataset(self, remote_id: str, **kwargs): dataset = self.inner_process_dataset(item, **kwargs) # Use `item.remote_id` because `inner_process_dataset` could have modified it. - dataset.harvest = self.update_harvest_info(HarvestDatasetMetadata, dataset.harvest, item.remote_id) + dataset.harvest = self.update_dataset_harvest_info(dataset.harvest, item.remote_id) dataset.archived = None # TODO: Apply editable mappings @@ -248,8 +248,8 @@ def process_dataservice(self, remote_id: str, **kwargs) -> bool : dataservice = self.inner_process_dataservice(item, **kwargs) - dataservice.harvest = self.update_harvest_info(HarvestDataserviceMetadata, dataservice.harvest, remote_id) - dataservice.archived = None + dataservice.harvest = self.update_dataservice_harvest_info(dataservice.harvest, remote_id) + dataservice.archived_at = None # TODO: Apply editable mappings @@ -279,18 +279,39 @@ def process_dataservice(self, remote_id: str, **kwargs) -> bool : item.ended = datetime.utcnow() self.save_job() - def update_harvest_info(self, default, harvest: Optional[HarvestDatasetMetadata], remote_id: int): + def update_dataset_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int): if not harvest: - harvest = default() + harvest = HarvestDatasetMetadata() - harvest.domain = self.source.domain - harvest.remote_id = remote_id + harvest.backend = self.display_name + # created_at + # modified_at harvest.source_id = str(self.source.id) + harvest.remote_id = remote_id + harvest.domain = self.source.domain harvest.last_update = datetime.utcnow() + # remote_url + # uri + # dct_identifier + harvest.archived_at = None + harvest.archived = None + + return harvest + + def update_dataservice_harvest_info(self, harvest: Optional[HarvestDataserviceMetadata], remote_id: int): + if not harvest: + harvest = HarvestDataserviceMetadata() + harvest.backend = self.display_name + harvest.domain = self.source.domain + + harvest.source_id = str(self.source.id) + harvest.source_id = str(self.source.url) + + harvest.remote_id = remote_id + harvest.last_harvested_at = datetime.utcnow() harvest.archived_at = None - harvest.archived = None return harvest From c9b99404ab6613686d55657c6fe142fcdfeef650 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: 
Wed, 5 Jun 2024 10:16:22 +0200 Subject: [PATCH 39/60] Add comment --- udata/harvest/backends/base.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 52930f4457..9b3ee0735c 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -284,18 +284,15 @@ def update_dataset_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], harvest = HarvestDatasetMetadata() harvest.backend = self.display_name - # created_at - # modified_at harvest.source_id = str(self.source.id) harvest.remote_id = remote_id harvest.domain = self.source.domain harvest.last_update = datetime.utcnow() - # remote_url - # uri - # dct_identifier harvest.archived_at = None harvest.archived = None + # created_at, modified_at, remote_url, uri, dct_identifier are set in `dataset_from_rdf` + return harvest def update_dataservice_harvest_info(self, harvest: Optional[HarvestDataserviceMetadata], remote_id: int): From 69ec78451ab667894e2e93b5e2c62948287a980e Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 5 Jun 2024 10:30:37 +0200 Subject: [PATCH 40/60] Add dataservice card to harvester admin --- js/components/harvest/item.vue | 13 +++++++++++++ udata/core/dataservices/models.py | 5 +++-- udata/harvest/api.py | 4 ++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/js/components/harvest/item.vue b/js/components/harvest/item.vue index 54a72b441f..20df42808c 100644 --- a/js/components/harvest/item.vue +++ b/js/components/harvest/item.vue @@ -34,6 +34,19 @@ :dataset="item.dataset"> +
+            <div class="row" v-if="item.dataservice">
+                <div class="col-xs-12">
+                    <h4>{{ _('Dataservice') }}</h4>
+                    <a :href="item.dataservice.self_web_url">
+                        {{ item.dataservice.title | truncate 80 }}
+                    </a>
+                    <div>
+                        {{{ item.dataservice.description | markdown 180 }}}
+                    </div>
+                </div>
+            </div>
+
             <h4>{{ _('Errors') }}</h4>
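The card above reads three attributes from `item.dataservice`; they only exist in the payload because the API change below nests `Dataservice.__read_fields__` under each harvest item. A minimal sketch of the shape this implies — the values are invented, and the exact field set depends on what `@generate_fields()` exposes:

```python
# Hypothetical harvest item as serialized by the API once `dataservice`
# is nested (illustrative values; only the shape matters here).
item = {
    "remote_id": "https://data.paris2024.org/api/explore/v2.1/",
    "status": "done",
    "dataset": None,
    "dataservice": {
        "title": "Explore API v2",
        "description": "HTTP API to query the datasets of this catalog.",
        "self_web_url": "https://www.data.gouv.fr/fr/dataservices/explore-api-v2/",
    },
}

# The Vue card only consumes these three fields:
assert {"title", "description", "self_web_url"} <= item["dataservice"].keys()
```
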
diff --git a/udata/core/dataservices/models.py b/udata/core/dataservices/models.py index 8ce723e48a..bd123c6b01 100644 --- a/udata/core/dataservices/models.py +++ b/udata/core/dataservices/models.py @@ -134,8 +134,9 @@ class Dataservice(WithMetrics, Owned, db.Document): def self_api_url(self): return endpoint_for('api.dataservice', dataservice=self, _external=True) - def self_web_url(): - pass + @function_field(description="Link to the udata web page for this dataservice") + def self_web_url(self): + return endpoint_for('dataservices.show', dataservice=self) # TODO # frequency = db.StringField(choices=list(UPDATE_FREQUENCIES.keys())) diff --git a/udata/harvest/api.py b/udata/harvest/api.py index 1959545dff..b82327c492 100644 --- a/udata/harvest/api.py +++ b/udata/harvest/api.py @@ -4,6 +4,7 @@ from udata.api import api, API, fields from udata.auth import admin_permission +from udata.core.dataservices.models import Dataservice from udata.core.dataset.api_fields import dataset_ref_fields, dataset_fields from udata.core.organization.api_fields import org_ref_fields from udata.core.organization.permissions import EditOrganizationPermission @@ -37,6 +38,9 @@ def backends_ids(): 'dataset': fields.Nested(dataset_ref_fields, description='The processed dataset', allow_null=True), + 'dataservice': fields.Nested(Dataservice.__read_fields__, + description='The processed dataservice', + allow_null=True), 'status': fields.String(description='The item status', required=True, enum=list(HARVEST_ITEM_STATUS)), From 3403944f7cda11dcd0276d83b6e52928d1bbfbb0 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 5 Jun 2024 10:44:37 +0200 Subject: [PATCH 41/60] Revert XSLT modifications --- udata/core/dataset/rdf.py | 6 ------ udata/harvest/backends/dcat.py | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index 0c331dd35c..2009591906 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -426,12 +426,6 @@ def resource_from_rdf(graph_or_distrib, dataset=None, is_additionnal=False): if not url: log.warning(f'Resource without url: {distrib}') return - - try: - url = uris.validate(url) - except uris.ValidationError: - log.warning(f'Resource with invalid url: {url}') - return if dataset: resource = get_by(dataset.resources, 'url', url) diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index 4c9e771b7c..68ea397d9c 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -272,7 +272,7 @@ class CswIso19139DcatBackend(DcatBackend): ISO_SCHEMA = 'http://www.isotc211.org/2005/gmd' - XSL_URL = "https://raw.githubusercontent.com/datagouv/iso-19139-to-dcat-ap/empty_function_to_distribution/iso-19139-to-dcat-ap.xsl" + XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl" def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, None]: """ From 7c5b0ca0ef1f59e6ae9cf571bedf10d2d07ae7fc Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 5 Jun 2024 10:45:35 +0200 Subject: [PATCH 42/60] Remove some blanks lines --- udata/harvest/backends/dcat.py | 1 - udata/harvest/tests/test_dcat_backend.py | 1 - 2 files changed, 2 deletions(-) diff --git a/udata/harvest/backends/dcat.py b/udata/harvest/backends/dcat.py index 68ea397d9c..fdc96b466d 100644 --- a/udata/harvest/backends/dcat.py +++ b/udata/harvest/backends/dcat.py @@ -324,7 +324,6 @@ def walk_graph(self, url: str, fmt: str) -> Generator[tuple[int, Graph], None, 
N headers=headers) response.raise_for_status() - tree_before_transform = ET.fromstring(response.content) # Disabling CoupledResourceLookUp to prevent failure on xlink:href # https://github.com/SEMICeu/iso-19139-to-dcat-ap/blob/master/documentation/HowTo.md#parameter-coupledresourcelookup diff --git a/udata/harvest/tests/test_dcat_backend.py b/udata/harvest/tests/test_dcat_backend.py index c8d5374352..1b69e94211 100644 --- a/udata/harvest/tests/test_dcat_backend.py +++ b/udata/harvest/tests/test_dcat_backend.py @@ -143,7 +143,6 @@ def test_flat_with_blank_nodes(self, rmock): assert datasets['1'].resources[0].mime == 'application/json' @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas', HARVEST_MAX_CATALOG_SIZE_IN_MONGO=None, HARVEST_GRAPHS_S3_BUCKET="test_bucket", S3_URL="https://example.org", S3_ACCESS_KEY_ID="myUser", S3_SECRET_ACCESS_KEY="password") - def test_flat_with_blank_nodes_xml(self, rmock): rmock.get('https://example.com/schemas', json=ResourceSchemaMockData.get_mock_data()) From 8e3e5ef336856f96501dfefd2d4767e28bc5d6d3 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 5 Jun 2024 11:05:21 +0200 Subject: [PATCH 43/60] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f9721232e..a81d502b2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - Allow dataservices to be discussed and followed [#3049](https://github.com/opendatateam/udata/pull/3049) - Add purge-dataservices job [#3049](https://github.com/opendatateam/udata/pull/3049) - Harvest all the available polygons from a spatial coverage [#3039](https://github.com/opendatateam/udata/pull/3039) +- Harvest dataservices [#3029](https://github.com/opendatateam/udata/pull/3029) ## 8.0.1 (2024-05-28) From 9a9a43b8313b306377056f7c7f6b2c777973c135 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 5 Jun 2024 16:00:36 +0200 Subject: [PATCH 44/60] Do not duplicate datasets on each harvesting --- udata/core/dataservices/rdf.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py index 7e9780573e..cb2bd863e4 100644 --- a/udata/core/dataservices/rdf.py +++ b/udata/core/dataservices/rdf.py @@ -10,7 +10,7 @@ from udata.rdf import DCAT, DCT, contact_point_from_rdf, rdf_value, theme_labels_from_rdf, themes_from_rdf, url_from_rdf -def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, datasets: List[Dataset]) -> Dataservice : +def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, all_datasets: List[Dataset]) -> Dataservice : ''' Create or update a dataset from a RDF/DCAT graph ''' @@ -27,16 +27,19 @@ def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, datasets: dataservice.contact_point = contact_point_from_rdf(d, dataservice) or dataservice.contact_point + datasets = [] for dataset_node in d.objects(DCAT.servesDataset): id = dataset_node.value(DCT.identifier) - dataset = next((d for d in datasets if d is not None and d.harvest.remote_id == id), None) + dataset = next((d for d in all_datasets if d is not None and d.harvest.remote_id == id), None) if dataset is None: # We try with `endswith` because Europe XSLT have problems with IDs. Sometimes they are prefixed with the domain of the catalog, sometimes not. 
- dataset = next((d for d in datasets if d is not None and d.harvest.remote_id.endswith(id)), None) + dataset = next((d for d in all_datasets if d is not None and d.harvest.remote_id.endswith(id)), None) if dataset is not None: - dataservice.datasets.append(dataset.id) + datasets.append(dataset.id) + + dataservice.datasets = datasets license = rdf_value(d, DCT.license) if license is not None: From 5fee125cd7a5faf64ffc5eb64ef6f17bf111f7c5 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Mon, 10 Jun 2024 11:31:47 +0200 Subject: [PATCH 45/60] Cleanup imports --- udata/core/dataset/rdf.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index cbff089cef..9ab0d3923e 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -6,7 +6,6 @@ import logging from datetime import date -from html.parser import HTMLParser from typing import Optional from dateutil.parser import parse as parse_dt from flask import current_app @@ -20,14 +19,11 @@ from udata.core.spatial.models import SpatialCoverage from udata.core.dataset.models import HarvestDatasetMetadata, HarvestResourceMetadata from udata.harvest.exceptions import HarvestSkipException -from udata.models import db, ContactPoint +from udata.models import db from udata.rdf import ( - DCAT, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS, contact_point_from_rdf, - namespace_manager, rdf_value, sanitize_html, schema_from_rdf, themes_from_rdf, url_from_rdf - , DCATAP, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS, - HVD_LEGISLATION, namespace_manager, schema_from_rdf, url_from_rdf + DCAT, DCATAP, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS, contact_point_from_rdf, + namespace_manager, rdf_value, sanitize_html, schema_from_rdf, themes_from_rdf, url_from_rdf, HVD_LEGISLATION ) -from udata.tags import slug as slugify_tag from udata.utils import get_by, safe_unicode from udata.uris import endpoint_for From 31bfda791c34dc1cc8b62427c26415a068c85d36 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Mon, 10 Jun 2024 11:32:22 +0200 Subject: [PATCH 46/60] Fix changelog --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 67f38f435e..163b48f7ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ## Current (in progress) -- Nothing yet +- Harvest dataservices [#3029](https://github.com/opendatateam/udata/pull/3029) ## 9.0.0 (2024-06-07) @@ -11,7 +11,6 @@ - Allow dataservices to be discussed and followed [#3049](https://github.com/opendatateam/udata/pull/3049) - Add purge-dataservices job [#3049](https://github.com/opendatateam/udata/pull/3049) - Harvest all the available polygons from a spatial coverage [#3039](https://github.com/opendatateam/udata/pull/3039) -- Harvest dataservices [#3029](https://github.com/opendatateam/udata/pull/3029) ## 8.0.1 (2024-05-28) From f477ef6af916b2c35f35da7228b1070401750a29 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Mon, 10 Jun 2024 13:27:22 +0200 Subject: [PATCH 47/60] Fix merge --- udata/core/dataset/rdf.py | 5 +++-- udata/rdf.py | 3 ++- udata/tests/dataset/test_dataset_rdf.py | 5 ++++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index 9ab0d3923e..28e3233da2 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -21,8 +21,9 @@ from udata.harvest.exceptions import 
HarvestSkipException from udata.models import db from udata.rdf import ( - DCAT, DCATAP, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS, contact_point_from_rdf, - namespace_manager, rdf_value, sanitize_html, schema_from_rdf, themes_from_rdf, url_from_rdf, HVD_LEGISLATION + DCAT, DCATAP, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, TAG_TO_EU_HVD_CATEGORIES, RDFS, + namespace_manager, rdf_value, sanitize_html, schema_from_rdf, themes_from_rdf, url_from_rdf, HVD_LEGISLATION, + contact_point_from_rdf, ) from udata.utils import get_by, safe_unicode from udata.uris import endpoint_for diff --git a/udata/rdf.py b/udata/rdf.py index 2d69d49b69..6d53a9c1aa 100644 --- a/udata/rdf.py +++ b/udata/rdf.py @@ -18,6 +18,7 @@ from udata.models import Schema from udata.mongo.errors import FieldValidationError from udata.frontend.markdown import parse_html +from udata.tags import slug as slugify_tag log = logging.getLogger(__name__) @@ -113,7 +114,7 @@ "http://data.europa.eu/bna/c_e1da4e07": "Statistiques" } HVD_LEGISLATION = 'http://data.europa.eu/eli/reg_impl/2023/138/oj' - +TAG_TO_EU_HVD_CATEGORIES = {slugify_tag(EU_HVD_CATEGORIES[uri]): uri for uri in EU_HVD_CATEGORIES} def guess_format(string): '''Guess format given an extension or a mime-type''' diff --git a/udata/tests/dataset/test_dataset_rdf.py b/udata/tests/dataset/test_dataset_rdf.py index 4ee7fb7ed5..c5a09a332b 100644 --- a/udata/tests/dataset/test_dataset_rdf.py +++ b/udata/tests/dataset/test_dataset_rdf.py @@ -18,7 +18,10 @@ from udata.core.dataset.rdf import ( dataset_to_rdf, dataset_from_rdf, resource_to_rdf, resource_from_rdf, temporal_from_rdf, frequency_to_rdf, frequency_from_rdf, - EU_RDF_REQUENCIES, TAG_TO_EU_HVD_CATEGORIES + EU_RDF_REQUENCIES +) +from udata.rdf import ( + TAG_TO_EU_HVD_CATEGORIES ) from udata.core.organization.factories import OrganizationFactory from udata.i18n import gettext as _ From 9f9a7f86069bdd895ce421b67c36d9a8b0aa4a6b Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 11 Jun 2024 14:30:29 +0200 Subject: [PATCH 48/60] Apply suggestions from code review Co-authored-by: maudetes --- udata/core/dataset/rdf.py | 2 +- udata/harvest/backends/base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index 28e3233da2..af126ff65e 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -523,7 +523,7 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None): dataset.title = rdf_value(d, DCT.title) if not dataset.title: - raise HarvestSkipException("missing title") + raise HarvestSkipException("missing title on dataset") # Support dct:abstract if dct:description is missing (sometimes used instead) description = d.value(DCT.description) or d.value(DCT.abstract) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index f024559034..071f029895 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -308,7 +308,7 @@ def update_dataservice_harvest_info(self, harvest: Optional[HarvestDataserviceMe harvest.domain = self.source.domain harvest.source_id = str(self.source.id) - harvest.source_id = str(self.source.url) + harvest.source_url = str(self.source.url) harvest.remote_id = remote_id harvest.last_harvested_at = datetime.utcnow() From 2a0da24f2b6b910deded8c68b42e845bd2362147 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 11 Jun 2024 14:46:47 +0200 Subject: [PATCH 49/60] add harvest metadata to API --- 
udata/api_fields.py | 10 +++++++--- udata/core/dataservices/models.py | 17 +++++++++-------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/udata/api_fields.py b/udata/api_fields.py index 1f9128cfc1..40afb212b4 100644 --- a/udata/api_fields.py +++ b/udata/api_fields.py @@ -70,10 +70,14 @@ def convert_db_to_field(key, field, info = {}): constructor_write = restx_fields.String elif isinstance(field, mongo_fields.EmbeddedDocumentField): nested_fields = info.get('nested_fields') - if nested_fields is None: - raise ValueError(f"EmbeddedDocumentField `{key}` requires a `nested_fields` param to serialize/deserialize.") + if nested_fields is not None: + constructor = lambda **kwargs: restx_fields.Nested(nested_fields, **kwargs) + elif hasattr(field.document_type_obj, '__read_fields__'): + constructor_read = lambda **kwargs: restx_fields.Nested(field.document_type_obj.__read_fields__, **kwargs) + constructor_write = lambda **kwargs: restx_fields.Nested(field.document_type_obj.__write_fields__, **kwargs) + else: + raise ValueError(f"EmbeddedDocumentField `{key}` requires a `nested_fields` param to serialize/deserialize or a `@generate_fields()` definition.") - constructor = lambda **kwargs: restx_fields.Nested(nested_fields, **kwargs) else: raise ValueError(f"Unsupported MongoEngine field type {field.__class__.__name__}") diff --git a/udata/core/dataservices/models.py b/udata/core/dataservices/models.py index bd123c6b01..4288a65a09 100644 --- a/udata/core/dataservices/models.py +++ b/udata/core/dataservices/models.py @@ -31,17 +31,18 @@ def hidden(self): db.Q(deleted_at__ne=None) | db.Q(archived_at__ne=None)) +@generate_fields() class HarvestMetadata(db.DynamicEmbeddedDocument): - backend = db.StringField() - domain = db.StringField() + backend = field(db.StringField()) + domain = field(db.StringField()) - source_id = db.StringField() - source_url = db.URLField() + source_id = field(db.StringField()) + source_url = field(db.URLField()) - remote_id = db.StringField() + remote_id = field(db.StringField()) - last_harvested_at = db.DateTimeField() - archived_at = db.DateTimeField() + last_harvested_at = field(db.DateTimeField()) + archived_at = field(db.DateTimeField()) @generate_fields() class Dataservice(WithMetrics, Owned, db.Document): @@ -128,7 +129,7 @@ class Dataservice(WithMetrics, Owned, db.Document): ) ) - harvest = db.EmbeddedDocumentField(HarvestMetadata) + harvest = field(db.EmbeddedDocumentField(HarvestMetadata)) @function_field(description="Link to the API endpoint for this dataservice") def self_api_url(self): From dd541580adcb17eadda81c664202c27e6db5e53d Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 11 Jun 2024 14:49:17 +0200 Subject: [PATCH 50/60] harvest metadata as readonly --- udata/core/dataservices/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udata/core/dataservices/models.py b/udata/core/dataservices/models.py index 4288a65a09..1c92c32269 100644 --- a/udata/core/dataservices/models.py +++ b/udata/core/dataservices/models.py @@ -129,7 +129,10 @@ class Dataservice(WithMetrics, Owned, db.Document): ) ) - harvest = field(db.EmbeddedDocumentField(HarvestMetadata)) + harvest = field( + db.EmbeddedDocumentField(HarvestMetadata), + readonly=True, + ) @function_field(description="Link to the API endpoint for this dataservice") def self_api_url(self): From 30ba48d17fdc490975d0136240d0343738d907bd Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 11 Jun 2024 16:02:15 +0200 Subject: [PATCH 51/60] Rename last_harvested_at and 
add harvest.created_at --- udata/core/dataservices/models.py | 9 ++++++++- udata/core/dataservices/rdf.py | 2 +- udata/harvest/backends/base.py | 8 ++++---- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/udata/core/dataservices/models.py b/udata/core/dataservices/models.py index 1c92c32269..4649cebfcd 100644 --- a/udata/core/dataservices/models.py +++ b/udata/core/dataservices/models.py @@ -41,7 +41,14 @@ class HarvestMetadata(db.DynamicEmbeddedDocument): remote_id = field(db.StringField()) - last_harvested_at = field(db.DateTimeField()) + created_at = field( + db.DatetimeField(), + description="Date of the creation as provided by the harvested catalog" + ) + last_update = field( + db.DateTimeField(), + description="Date of the last harvesting" + ) archived_at = field(db.DateTimeField()) @generate_fields() diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py index cb2bd863e4..3f596522fc 100644 --- a/udata/core/dataservices/rdf.py +++ b/udata/core/dataservices/rdf.py @@ -45,7 +45,7 @@ def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, all_datas if license is not None: dataservice.license = License.guess(license) - dataservice.created_at = rdf_value(d, DCT.issued) + dataservice.harvest.created_at = rdf_value(d, DCT.issued) dataservice.metadata_modified_at = rdf_value(d, DCT.modified) dataservice.tags = themes_from_rdf(d) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 071f029895..76f3bb770a 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -251,6 +251,9 @@ def process_dataservice(self, remote_id: str, **kwargs) -> bool : if not remote_id: raise HarvestSkipException("missing identifier") + if not dataservice.harvest: + dataservice.harvest = HarvestDataserviceMetadata() + dataservice = self.inner_process_dataservice(item, **kwargs) dataservice.harvest = self.update_dataservice_harvest_info(dataservice.harvest, remote_id) @@ -300,10 +303,7 @@ def update_dataset_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], return harvest - def update_dataservice_harvest_info(self, harvest: Optional[HarvestDataserviceMetadata], remote_id: int): - if not harvest: - harvest = HarvestDataserviceMetadata() - + def update_dataservice_harvest_info(self, harvest: HarvestDataserviceMetadata, remote_id: int): harvest.backend = self.display_name harvest.domain = self.source.domain From b083c3726d0eee8bb836727caa749448cf0364c9 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 11 Jun 2024 16:05:08 +0200 Subject: [PATCH 52/60] Fix wrong attribute --- udata/harvest/backends/base.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index 76f3bb770a..da7ac782c4 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -194,6 +194,9 @@ def process_dataset(self, remote_id: str, **kwargs): if not remote_id: raise HarvestSkipException("missing identifier") + if not dataset.harvest: + dataset.harvest = HarvestDatasetMetadata() + current_app.logger.addHandler(log_catcher) dataset = self.inner_process_dataset(item, **kwargs) @@ -287,10 +290,7 @@ def process_dataservice(self, remote_id: str, **kwargs) -> bool : item.ended = datetime.utcnow() self.save_job() - def update_dataset_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int): - if not harvest: - harvest = HarvestDatasetMetadata() - + def update_dataset_harvest_info(self, harvest: 
HarvestDatasetMetadata, remote_id: int): harvest.backend = self.display_name harvest.source_id = str(self.source.id) harvest.remote_id = remote_id @@ -311,7 +311,7 @@ def update_dataservice_harvest_info(self, harvest: HarvestDataserviceMetadata, r harvest.source_url = str(self.source.url) harvest.remote_id = remote_id - harvest.last_harvested_at = datetime.utcnow() + harvest.last_update = datetime.utcnow() harvest.archived_at = None From fbe6f2212f77605c04a415e33deb55830bc76dcd Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 11 Jun 2024 16:06:19 +0200 Subject: [PATCH 53/60] Do not empty the datasets list if no datasets found in harvesting --- udata/core/dataservices/rdf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py index 3f596522fc..3d84b35376 100644 --- a/udata/core/dataservices/rdf.py +++ b/udata/core/dataservices/rdf.py @@ -39,7 +39,8 @@ def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, all_datas if dataset is not None: datasets.append(dataset.id) - dataservice.datasets = datasets + if datasets: + dataservice.datasets = datasets license = rdf_value(d, DCT.license) if license is not None: From 786354755d5ee9ef323e6076aed592cdcf6a9c07 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 11 Jun 2024 17:23:48 +0200 Subject: [PATCH 54/60] fix casing --- udata/core/dataservices/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udata/core/dataservices/models.py b/udata/core/dataservices/models.py index 4649cebfcd..aa2dfe7010 100644 --- a/udata/core/dataservices/models.py +++ b/udata/core/dataservices/models.py @@ -42,7 +42,7 @@ class HarvestMetadata(db.DynamicEmbeddedDocument): remote_id = field(db.StringField()) created_at = field( - db.DatetimeField(), + db.DateTimeField(), description="Date of the creation as provided by the harvested catalog" ) last_update = field( From b4017001651d4c87098ea1a75c800ccb463713a8 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Tue, 11 Jun 2024 17:41:18 +0200 Subject: [PATCH 55/60] Fix tests --- udata/core/dataservices/rdf.py | 6 ++++-- udata/harvest/backends/base.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py index 3d84b35376..4c3b832238 100644 --- a/udata/core/dataservices/rdf.py +++ b/udata/core/dataservices/rdf.py @@ -3,13 +3,12 @@ from typing import List, Optional from rdflib import RDF, Graph -from udata.core.dataservices.models import Dataservice, HarvestMetadata +from udata.core.dataservices.models import Dataservice, HarvestMetadata as HarvestDataserviceMetadata from udata.core.dataset.models import Dataset, License from udata.core.dataset.rdf import sanitize_html from udata.harvest.models import HarvestSource from udata.rdf import DCAT, DCT, contact_point_from_rdf, rdf_value, theme_labels_from_rdf, themes_from_rdf, url_from_rdf - def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, all_datasets: List[Dataset]) -> Dataservice : ''' Create or update a dataset from a RDF/DCAT graph @@ -46,6 +45,9 @@ def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, all_datas if license is not None: dataservice.license = License.guess(license) + if not dataservice.harvest: + dataservice.harvest = HarvestDataserviceMetadata() + dataservice.harvest.created_at = rdf_value(d, DCT.issued) dataservice.metadata_modified_at = rdf_value(d, DCT.modified) diff --git 
a/udata/harvest/backends/base.py b/udata/harvest/backends/base.py index da7ac782c4..4496e71a94 100644 --- a/udata/harvest/backends/base.py +++ b/udata/harvest/backends/base.py @@ -194,9 +194,6 @@ def process_dataset(self, remote_id: str, **kwargs): if not remote_id: raise HarvestSkipException("missing identifier") - if not dataset.harvest: - dataset.harvest = HarvestDatasetMetadata() - current_app.logger.addHandler(log_catcher) dataset = self.inner_process_dataset(item, **kwargs) @@ -254,9 +251,6 @@ def process_dataservice(self, remote_id: str, **kwargs) -> bool : if not remote_id: raise HarvestSkipException("missing identifier") - if not dataservice.harvest: - dataservice.harvest = HarvestDataserviceMetadata() - dataservice = self.inner_process_dataservice(item, **kwargs) dataservice.harvest = self.update_dataservice_harvest_info(dataservice.harvest, remote_id) @@ -290,7 +284,10 @@ def process_dataservice(self, remote_id: str, **kwargs) -> bool : item.ended = datetime.utcnow() self.save_job() - def update_dataset_harvest_info(self, harvest: HarvestDatasetMetadata, remote_id: int): + def update_dataset_harvest_info(self, harvest: Optional[HarvestDatasetMetadata], remote_id: int): + if not harvest: + harvest = HarvestDatasetMetadata() + harvest.backend = self.display_name harvest.source_id = str(self.source.id) harvest.remote_id = remote_id @@ -303,7 +300,10 @@ def update_dataset_harvest_info(self, harvest: HarvestDatasetMetadata, remote_id return harvest - def update_dataservice_harvest_info(self, harvest: HarvestDataserviceMetadata, remote_id: int): + def update_dataservice_harvest_info(self, harvest: Optional[HarvestDataserviceMetadata], remote_id: int): + if not harvest: + harvest = HarvestDataserviceMetadata() + harvest.backend = self.display_name harvest.domain = self.source.domain From 1d6714ad26d9bba3250dc3c8f536a739fdac393c Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 12 Jun 2024 14:35:00 +0200 Subject: [PATCH 56/60] Save node id if it's an URL --- udata/core/dataservices/rdf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py index 4c3b832238..542d6be1d0 100644 --- a/udata/core/dataservices/rdf.py +++ b/udata/core/dataservices/rdf.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import List, Optional -from rdflib import RDF, Graph +from rdflib import RDF, Graph, URIRef from udata.core.dataservices.models import Dataservice, HarvestMetadata as HarvestDataserviceMetadata from udata.core.dataset.models import Dataset, License @@ -48,6 +48,10 @@ def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, all_datas if not dataservice.harvest: dataservice.harvest = HarvestDataserviceMetadata() + # If the node ID is a `URIRef` it means it links to something external, if it's not an URIRef it's often a + # auto-generated ID just to link multiple RDF node togethers. When exporting as RDF to other catalogs, we + # want to re-use this node ID (only if it's not auto-generated) to improve compatibility. 
+ dataservice.harvest.rdf_node_id_as_url = d.identifier.toPython() if isinstance(d.identifier, URIRef) else None dataservice.harvest.created_at = rdf_value(d, DCT.issued) dataservice.metadata_modified_at = rdf_value(d, DCT.modified) From cc6b5e8f967f9ce3233e1f93492b33b4b7b8be5f Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Thu, 13 Jun 2024 09:06:31 +0200 Subject: [PATCH 57/60] Add landing page as remote_url --- udata/core/dataservices/models.py | 1 + udata/core/dataservices/rdf.py | 3 ++- udata/core/dataset/rdf.py | 18 +----------------- udata/harvest/tests/test_dcat_backend.py | 1 + udata/rdf.py | 15 +++++++++++++++ 5 files changed, 20 insertions(+), 18 deletions(-) diff --git a/udata/core/dataservices/models.py b/udata/core/dataservices/models.py index aa2dfe7010..69fa82428c 100644 --- a/udata/core/dataservices/models.py +++ b/udata/core/dataservices/models.py @@ -40,6 +40,7 @@ class HarvestMetadata(db.DynamicEmbeddedDocument): source_url = field(db.URLField()) remote_id = field(db.StringField()) + remote_url = field(db.URLField()) created_at = field( db.DateTimeField(), diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py index 542d6be1d0..73167b0f27 100644 --- a/udata/core/dataservices/rdf.py +++ b/udata/core/dataservices/rdf.py @@ -7,7 +7,7 @@ from udata.core.dataset.models import Dataset, License from udata.core.dataset.rdf import sanitize_html from udata.harvest.models import HarvestSource -from udata.rdf import DCAT, DCT, contact_point_from_rdf, rdf_value, theme_labels_from_rdf, themes_from_rdf, url_from_rdf +from udata.rdf import DCAT, DCT, contact_point_from_rdf, rdf_value, remote_url_from_rdf, theme_labels_from_rdf, themes_from_rdf, url_from_rdf def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, all_datasets: List[Dataset]) -> Dataservice : ''' @@ -52,6 +52,7 @@ def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, all_datas # auto-generated ID just to link multiple RDF node togethers. When exporting as RDF to other catalogs, we # want to re-use this node ID (only if it's not auto-generated) to improve compatibility. dataservice.harvest.rdf_node_id_as_url = d.identifier.toPython() if isinstance(d.identifier, URIRef) else None + dataservice.harvest.remote_url = remote_url_from_rdf(d) dataservice.harvest.created_at = rdf_value(d, DCT.issued) dataservice.metadata_modified_at = rdf_value(d, DCT.modified) diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index af126ff65e..e6adeb0652 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -22,7 +22,7 @@ from udata.models import db from udata.rdf import ( DCAT, DCATAP, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, TAG_TO_EU_HVD_CATEGORIES, RDFS, - namespace_manager, rdf_value, sanitize_html, schema_from_rdf, themes_from_rdf, url_from_rdf, HVD_LEGISLATION, + namespace_manager, rdf_value, remote_url_from_rdf, sanitize_html, schema_from_rdf, themes_from_rdf, url_from_rdf, HVD_LEGISLATION, contact_point_from_rdf, ) from udata.utils import get_by, safe_unicode @@ -429,22 +429,6 @@ def title_from_rdf(rdf, url): else: return i18n._('Nameless resource') - -def remote_url_from_rdf(rdf): - ''' - Return DCAT.landingPage if found and uri validation succeeds. - Use RDF identifier as fallback if uri validation succeeds. 
- ''' - landing_page = url_from_rdf(rdf, DCAT.landingPage) - uri = rdf.identifier.toPython() - for candidate in [landing_page, uri]: - if candidate: - try: - uris.validate(candidate) - return candidate - except uris.ValidationError: - pass - def resource_from_rdf(graph_or_distrib, dataset=None, is_additionnal=False): ''' Map a Resource domain model to a DCAT/RDF graph diff --git a/udata/harvest/tests/test_dcat_backend.py b/udata/harvest/tests/test_dcat_backend.py index 3cd2a60979..5fbc7a57a6 100644 --- a/udata/harvest/tests/test_dcat_backend.py +++ b/udata/harvest/tests/test_dcat_backend.py @@ -180,6 +180,7 @@ def test_harvest_dataservices(self, rmock): assert dataservices[0].title == "Explore API v2" assert dataservices[0].base_api_url == "https://data.paris2024.org/api/explore/v2.1/" assert dataservices[0].endpoint_description_url == "https://data.paris2024.org/api/explore/v2.1/swagger.json" + assert dataservices[0].harvest.remote_url == "https://data.paris2024.org/api/explore/v2.1/console" def test_harvest_literal_spatial(self, rmock): url = mock_dcat(rmock, 'evian.json') diff --git a/udata/rdf.py b/udata/rdf.py index 6d53a9c1aa..893434e429 100644 --- a/udata/rdf.py +++ b/udata/rdf.py @@ -322,6 +322,21 @@ def contact_point_from_rdf(rdf, dataset): return (contact_point or ContactPoint(name=name, email=email, owner=dataset.owner).save()) +def remote_url_from_rdf(rdf): + ''' + Return DCAT.landingPage if found and uri validation succeeds. + Use RDF identifier as fallback if uri validation succeeds. + ''' + landing_page = url_from_rdf(rdf, DCAT.landingPage) + uri = rdf.identifier.toPython() + for candidate in [landing_page, uri]: + if candidate: + try: + uris.validate(candidate) + return candidate + except uris.ValidationError: + pass + def schema_from_rdf(rdf): ''' Try to extract a schema from a conformsTo property. From d4841996b38f590c123b0fc96b3fd86d9db26087 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Fri, 14 Jun 2024 09:29:40 +0200 Subject: [PATCH 58/60] Rename rdf_node_id_as_url to follow the same name as dataset :-( --- udata/core/dataservices/models.py | 8 ++++++++ udata/core/dataservices/rdf.py | 5 +---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/udata/core/dataservices/models.py b/udata/core/dataservices/models.py index e86d467601..56e4530a97 100644 --- a/udata/core/dataservices/models.py +++ b/udata/core/dataservices/models.py @@ -42,6 +42,14 @@ class HarvestMetadata(db.DynamicEmbeddedDocument): remote_id = field(db.StringField()) remote_url = field(db.URLField()) + # If the node ID is a `URIRef` it means it links to something external, if it's not an `URIRef` it's often a + # auto-generated ID just to link multiple RDF node togethers. When exporting as RDF to other catalogs, we + # want to re-use this node ID (only if it's not auto-generated) to improve compatibility. + uri = field( + db.URLField(), + description="RDF node ID if it's an `URIRef`. 
`None` if it's not present or if it's a random auto-generated ID inside the graph.",
+    )
+
     created_at = field(
         db.DateTimeField(),
         description="Date of the creation as provided by the harvested catalog"

diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py
index 73167b0f27..53ad83b1b7 100644
--- a/udata/core/dataservices/rdf.py
+++ b/udata/core/dataservices/rdf.py
@@ -48,10 +48,7 @@ def dataservice_from_rdf(graph: Graph, dataservice: Dataservice, node, all_datas
     if not dataservice.harvest:
         dataservice.harvest = HarvestDataserviceMetadata()
 
-    # If the node ID is a `URIRef` it means it links to something external, if it's not an URIRef it's often a
-    # auto-generated ID just to link multiple RDF node togethers. When exporting as RDF to other catalogs, we
-    # want to re-use this node ID (only if it's not auto-generated) to improve compatibility.
-    dataservice.harvest.rdf_node_id_as_url = d.identifier.toPython() if isinstance(d.identifier, URIRef) else None
+    dataservice.harvest.uri = d.identifier.toPython() if isinstance(d.identifier, URIRef) else None
     dataservice.harvest.remote_url = remote_url_from_rdf(d)
     dataservice.harvest.created_at = rdf_value(d, DCT.issued)
     dataservice.metadata_modified_at = rdf_value(d, DCT.modified)

From fa816cc98f42e8b0c847370f002ebba3bb2de029 Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Fri, 14 Jun 2024 09:30:23 +0200
Subject: [PATCH 59/60] Remove dynamic from dataservice harvest metadata

---
 udata/core/dataservices/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udata/core/dataservices/models.py b/udata/core/dataservices/models.py
index 56e4530a97..537b6a7676 100644
--- a/udata/core/dataservices/models.py
+++ b/udata/core/dataservices/models.py
@@ -32,7 +32,7 @@ def hidden(self):
             db.Q(archived_at__ne=None))
 
 @generate_fields()
-class HarvestMetadata(db.DynamicEmbeddedDocument):
+class HarvestMetadata(db.EmbeddedDocument):
     backend = field(db.StringField())
 
     domain = field(db.StringField())

From 546c1be44a57a7de9c22817fe690927016d9e044 Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Fri, 14 Jun 2024 09:29:53 +0200
Subject: [PATCH 60/60] Update udata/core/dataservices/models.py

Co-authored-by: maudetes
---
 udata/core/dataservices/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udata/core/dataservices/models.py b/udata/core/dataservices/models.py
index 537b6a7676..1cc3f86fe2 100644
--- a/udata/core/dataservices/models.py
+++ b/udata/core/dataservices/models.py
@@ -159,7 +159,7 @@ def self_api_url(self):
 
     @function_field(description="Link to the udata web page for this dataservice")
     def self_web_url(self):
-        return endpoint_for('dataservices.show', dataservice=self)
+        return endpoint_for('dataservices.show', dataservice=self, _external=True)
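Taken together, the dataset/dataservice linking introduced in this series reduces to a small matching routine. Below is a self-contained sketch of that logic — plain objects instead of MongoDB documents, with `FakeDataset` and `match_served_dataset` being illustrative names — mirroring the exact-then-`endswith` lookup in `dataservice_from_rdf`:

```python
from typing import List, Optional


class FakeDataset:
    """Stand-in for a harvested udata Dataset; only the harvest remote_id matters here."""

    def __init__(self, remote_id: str):
        self.remote_id = remote_id


def match_served_dataset(identifier: str, all_datasets: List[FakeDataset]) -> Optional[FakeDataset]:
    # First try an exact match on the harvested remote_id…
    dataset = next((d for d in all_datasets if d is not None and d.remote_id == identifier), None)
    if dataset is None:
        # …then fall back to a suffix match, because some Europe XSLT outputs
        # prefix identifiers with the catalog domain and some do not.
        dataset = next(
            (d for d in all_datasets if d is not None and d.remote_id.endswith(identifier)),
            None,
        )
    return dataset


datasets = [FakeDataset("https://catalog.example/datasets/1"), FakeDataset("2")]
assert match_served_dataset("2", datasets).remote_id == "2"
assert match_served_dataset("datasets/1", datasets).remote_id == "https://catalog.example/datasets/1"
```

As in patch 53 above ("Do not empty the datasets list if no datasets found in harvesting"), a dataservice whose `servesDataset` references match nothing keeps its previously linked datasets instead of being emptied on each run.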