From cece5584a2c74749e3aae62dde71f1d446f0d602 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Mon, 29 Apr 2024 16:55:06 +0200 Subject: [PATCH] Add more data inside dataservices --- udata/core/dataservices/models.py | 15 ++++++ udata/core/dataservices/rdf.py | 42 ++++++++++++---- udata/core/dataset/rdf.py | 81 ++----------------------------- udata/rdf.py | 73 ++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 87 deletions(-) diff --git a/udata/core/dataservices/models.py b/udata/core/dataservices/models.py index 74e7c97b33..918acfbf1b 100644 --- a/udata/core/dataservices/models.py +++ b/udata/core/dataservices/models.py @@ -31,6 +31,19 @@ def hidden(self): db.Q(deleted_at__ne=None) | db.Q(archived_at__ne=None)) +class HarvestMetadata(db.DynamicEmbeddedDocument): + backend = db.StringField() + + source_id = db.StringField() + source_url = db.StringField() + + dct_identifier = db.StringField() + + remote_id = db.StringField() + remote_url = db.URLField() + + last_harvested_at = db.DateTimeField() + @generate_fields() class Dataservice(WithMetrics, Owned, db.Document): meta = { @@ -116,6 +129,8 @@ class Dataservice(WithMetrics, Owned, db.Document): ) ) + harvest = db.EmbeddedDocumentField(HarvestMetadata) + @function_field(description="Link to the API endpoint for this dataservice") def self_api_url(self): return endpoint_for('api.dataservice', dataservice=self, _external=True) diff --git a/udata/core/dataservices/rdf.py b/udata/core/dataservices/rdf.py index 5a30c107b3..74fb4620c9 100644 --- a/udata/core/dataservices/rdf.py +++ b/udata/core/dataservices/rdf.py @@ -1,16 +1,16 @@ -from pprint import pprint +from datetime import datetime from typing import Optional from rdflib import RDF, Graph -import rdflib -import lxml.etree as ET -from udata.core.dataservices.models import Dataservice -from udata.core.dataset.rdf import rdf_value, remote_url_from_rdf -from udata.rdf import DCAT, DCT, url_from_rdf +from udata.core.dataservices.models import Dataservice, HarvestMetadata +from udata.core.dataset.models import License +from udata.core.dataset.rdf import sanitize_html +from udata.harvest.models import HarvestSource +from udata.rdf import DCAT, DCT, contact_point_from_rdf, rdf_value, theme_labels_from_rdf, themes_from_rdf, url_from_rdf -def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None, node=None): +def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None, node=None, source: Optional[HarvestSource] = None): ''' Create or update a dataset from a RDF/DCAT graph ''' @@ -22,7 +22,31 @@ def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None d = graph.resource(node) dataservice.title = rdf_value(d, DCT.title) - dataservice.base_api_url = rdf_value(d, DCAT.endpointURL) - dataservice.endpoint_description_url = rdf_value(d, DCAT.endpointDescription) + dataservice.description = sanitize_html(d.value(DCT.description) or d.value(DCT.abstract)) + + dataservice.base_api_url = url_from_rdf(d, DCAT.endpointURL) + dataservice.endpoint_description_url = url_from_rdf(d, DCAT.endpointDescription) + + dataservice.contact_point = contact_point_from_rdf(d, dataservice) or dataservice.contact_point + + license = rdf_value(d, DCT.license) + if license is not None: + dataservice.license = License.guess(license) + + dataservice.created_at = rdf_value(d, DCT.issued) + dataservice.metadata_modified_at = rdf_value(d, DCT.modified) + + dataservice.tags = themes_from_rdf(d) + + if not dataservice.harvest: + dataservice.harvest = HarvestMetadata() + + if source is not None: + dataservice.harvest.backend = source.backend + + dataservice.harvest.source_id = source.id + dataservice.harvest.source_url = source.url + + dataservice.harvest.last_harvested_at = datetime.utcnow() return dataservice \ No newline at end of file diff --git a/udata/core/dataset/rdf.py b/udata/core/dataset/rdf.py index 793cd0c4ef..dc768c6211 100644 --- a/udata/core/dataset/rdf.py +++ b/udata/core/dataset/rdf.py @@ -18,12 +18,11 @@ from udata import i18n, uris from udata.core.spatial.models import SpatialCoverage -from udata.frontend.markdown import parse_html from udata.core.dataset.models import HarvestDatasetMetadata, HarvestResourceMetadata from udata.models import db, ContactPoint from udata.rdf import ( - DCAT, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS, - namespace_manager, schema_from_rdf, url_from_rdf + DCAT, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS, contact_point_from_rdf, + namespace_manager, rdf_value, sanitize_html, schema_from_rdf, theme_labels_from_rdf, themes_from_rdf, url_from_rdf ) from udata.utils import get_by, safe_unicode from udata.uris import endpoint_for @@ -77,32 +76,6 @@ } -class HTMLDetector(HTMLParser): - def __init__(self, *args, **kwargs): - HTMLParser.__init__(self, *args, **kwargs) - self.elements = set() - - def handle_starttag(self, tag, attrs): - self.elements.add(tag) - - def handle_endtag(self, tag): - self.elements.add(tag) - - -def is_html(text): - parser = HTMLDetector() - parser.feed(text) - return bool(parser.elements) - - -def sanitize_html(text): - text = text.toPython() if isinstance(text, Literal) else '' - if is_html(text): - return parse_html(text) - else: - return text.strip() - - def temporal_to_rdf(daterange, graph=None): if not daterange: return @@ -231,18 +204,6 @@ def dataset_to_rdf(dataset, graph=None): } -def serialize_value(value): - if isinstance(value, (URIRef, Literal)): - return value.toPython() - elif isinstance(value, RdfResource): - return value.identifier.toPython() - - -def rdf_value(obj, predicate, default=None): - value = obj.value(predicate) - return serialize_value(value) if value else default - - def temporal_from_literal(text): ''' Parse a temporal coverage from a literal ie. either: @@ -317,29 +278,6 @@ def temporal_from_rdf(period_of_time): # so we log the error for future investigation and improvement log.warning('Unable to parse temporal coverage', exc_info=True) - -def contact_point_from_rdf(rdf, dataset): - contact_point = rdf.value(DCAT.contactPoint) - if contact_point: - name = rdf_value(contact_point, VCARD.fn) or '' - email = (rdf_value(contact_point, VCARD.hasEmail) - or rdf_value(contact_point, VCARD.email) - or rdf_value(contact_point, DCAT.email)) - if not email: - return - email = email.replace('mailto:', '').strip() - if dataset.organization: - contact_point = ContactPoint.objects( - name=name, email=email, organization=dataset.organization).first() - return (contact_point or - ContactPoint(name=name, email=email, organization=dataset.organization).save()) - elif dataset.owner: - contact_point = ContactPoint.objects( - name=name, email=email, owner=dataset.owner).first() - return (contact_point or - ContactPoint(name=name, email=email, owner=dataset.owner).save()) - - def spatial_from_rdf(graph): geojsons = [] for term in graph.objects(DCT.spatial): @@ -467,17 +405,6 @@ def remote_url_from_rdf(rdf): except uris.ValidationError: pass - -def theme_labels_from_rdf(rdf): - for theme in rdf.objects(DCAT.theme): - if isinstance(theme, RdfResource): - label = rdf_value(theme, SKOS.prefLabel) - else: - label = theme.toPython() - if label: - yield label - - def resource_from_rdf(graph_or_distrib, dataset=None, is_additionnal=False): ''' Map a Resource domain model to a DCAT/RDF graph @@ -572,9 +499,7 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None): if acronym: dataset.acronym = acronym - tags = [tag.toPython() for tag in d.objects(DCAT.keyword)] - tags += theme_labels_from_rdf(d) - dataset.tags = list(set(tags)) + dataset.tags = themes_from_rdf(d) temporal_coverage = temporal_from_rdf(d.value(DCT.temporal)) if temporal_coverage: diff --git a/udata/rdf.py b/udata/rdf.py index 1203b1bb92..57d0de9077 100644 --- a/udata/rdf.py +++ b/udata/rdf.py @@ -1,6 +1,7 @@ ''' This module centralize udata-wide RDF helpers and configuration ''' +from html.parser import HTMLParser import logging import re @@ -13,8 +14,10 @@ ) from rdflib.util import SUFFIX_FORMAT_MAP, guess_format as raw_guess_format from udata import uris +from udata.core.contact_point.models import ContactPoint from udata.models import Schema from udata.mongo.errors import FieldValidationError +from udata.frontend.markdown import parse_html log = logging.getLogger(__name__) @@ -212,6 +215,42 @@ def want_rdf(): 'totalItems': 'hydra:totalItems', } +def serialize_value(value): + if isinstance(value, (URIRef, Literal)): + return value.toPython() + elif isinstance(value, RdfResource): + return value.identifier.toPython() + + +def rdf_value(obj, predicate, default=None): + value = obj.value(predicate) + return serialize_value(value) if value else default + +class HTMLDetector(HTMLParser): + def __init__(self, *args, **kwargs): + HTMLParser.__init__(self, *args, **kwargs) + self.elements = set() + + def handle_starttag(self, tag, attrs): + self.elements.add(tag) + + def handle_endtag(self, tag): + self.elements.add(tag) + + +def is_html(text): + parser = HTMLDetector() + parser.feed(text) + return bool(parser.elements) + + +def sanitize_html(text): + text = text.toPython() if isinstance(text, Literal) else '' + if is_html(text): + return parse_html(text) + else: + return text.strip() + def url_from_rdf(rdf, prop): ''' @@ -224,6 +263,40 @@ def url_from_rdf(rdf, prop): elif isinstance(value, RdfResource): return value.identifier.toPython() +def theme_labels_from_rdf(rdf): + for theme in rdf.objects(DCAT.theme): + if isinstance(theme, RdfResource): + label = rdf_value(theme, SKOS.prefLabel) + else: + label = theme.toPython() + if label: + yield label + +def themes_from_rdf(rdf): + tags = [tag.toPython() for tag in rdf.objects(DCAT.keyword)] + tags += theme_labels_from_rdf(rdf) + return list(set(tags)) + +def contact_point_from_rdf(rdf, dataset): + contact_point = rdf.value(DCAT.contactPoint) + if contact_point: + name = rdf_value(contact_point, VCARD.fn) or '' + email = (rdf_value(contact_point, VCARD.hasEmail) + or rdf_value(contact_point, VCARD.email) + or rdf_value(contact_point, DCAT.email)) + if not email: + return + email = email.replace('mailto:', '').strip() + if dataset.organization: + contact_point = ContactPoint.objects( + name=name, email=email, organization=dataset.organization).first() + return (contact_point or + ContactPoint(name=name, email=email, organization=dataset.organization).save()) + elif dataset.owner: + contact_point = ContactPoint.objects( + name=name, email=email, owner=dataset.owner).first() + return (contact_point or + ContactPoint(name=name, email=email, owner=dataset.owner).save()) def schema_from_rdf(rdf): '''