Skip to content

Commit

Permalink
Add more data inside dataservices
Browse files Browse the repository at this point in the history
  • Loading branch information
ThibaudDauce committed Apr 29, 2024
1 parent c68b455 commit cece558
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 87 deletions.
15 changes: 15 additions & 0 deletions udata/core/dataservices/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@ def hidden(self):
db.Q(deleted_at__ne=None) |
db.Q(archived_at__ne=None))

class HarvestMetadata(db.DynamicEmbeddedDocument):
    '''
    Harvesting information embedded in a `Dataservice` through its
    `harvest` field.
    '''
    # Copied from `source.backend` by `dataservice_from_rdf` when a harvest
    # source is provided.
    backend = db.StringField()

    # Copied from `source.id` and `source.url` by `dataservice_from_rdf`.
    # NOTE(review): `source.id` is assigned as-is; confirm it is a string
    # (or is converted) before it reaches this StringField.
    source_id = db.StringField()
    source_url = db.StringField()

    # Presumably the `dct:identifier` of the harvested node; it is not
    # assigned by the RDF importer visible here -- TODO confirm who sets it.
    dct_identifier = db.StringField()

    # Presumably the identifier and URL of the item on the remote
    # platform -- TODO confirm against the harvest backends.
    remote_id = db.StringField()
    remote_url = db.URLField()

    # Set to `datetime.utcnow()` by `dataservice_from_rdf` on each import.
    last_harvested_at = db.DateTimeField()

@generate_fields()
class Dataservice(WithMetrics, Owned, db.Document):
meta = {
Expand Down Expand Up @@ -116,6 +129,8 @@ class Dataservice(WithMetrics, Owned, db.Document):
)
)

harvest = db.EmbeddedDocumentField(HarvestMetadata)

@function_field(description="Link to the API endpoint for this dataservice")
def self_api_url(self):
    '''
    Build the link to the `api.dataservice` endpoint for this dataservice.

    The link is produced by `endpoint_for`, called with `_external=True`.
    '''
    return endpoint_for('api.dataservice', dataservice=self, _external=True)
Expand Down
42 changes: 33 additions & 9 deletions udata/core/dataservices/rdf.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@

from pprint import pprint
from datetime import datetime
from typing import Optional
from rdflib import RDF, Graph
import rdflib
import lxml.etree as ET

from udata.core.dataservices.models import Dataservice
from udata.core.dataset.rdf import rdf_value, remote_url_from_rdf
from udata.rdf import DCAT, DCT, url_from_rdf
from udata.core.dataservices.models import Dataservice, HarvestMetadata
from udata.core.dataset.models import License
from udata.core.dataset.rdf import sanitize_html
from udata.harvest.models import HarvestSource
from udata.rdf import DCAT, DCT, contact_point_from_rdf, rdf_value, theme_labels_from_rdf, themes_from_rdf, url_from_rdf


def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None, node=None):
def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None, node=None, source: Optional[HarvestSource] = None):
'''
Create or update a dataservice from an RDF/DCAT graph
'''
Expand All @@ -22,7 +22,31 @@ def dataservice_from_rdf(graph: Graph, dataservice: Optional[Dataservice] = None
d = graph.resource(node)

dataservice.title = rdf_value(d, DCT.title)
dataservice.base_api_url = rdf_value(d, DCAT.endpointURL)
dataservice.endpoint_description_url = rdf_value(d, DCAT.endpointDescription)
dataservice.description = sanitize_html(d.value(DCT.description) or d.value(DCT.abstract))

dataservice.base_api_url = url_from_rdf(d, DCAT.endpointURL)
dataservice.endpoint_description_url = url_from_rdf(d, DCAT.endpointDescription)

dataservice.contact_point = contact_point_from_rdf(d, dataservice) or dataservice.contact_point

license = rdf_value(d, DCT.license)
if license is not None:
dataservice.license = License.guess(license)

dataservice.created_at = rdf_value(d, DCT.issued)
dataservice.metadata_modified_at = rdf_value(d, DCT.modified)

dataservice.tags = themes_from_rdf(d)

if not dataservice.harvest:
dataservice.harvest = HarvestMetadata()

if source is not None:
dataservice.harvest.backend = source.backend

dataservice.harvest.source_id = source.id
dataservice.harvest.source_url = source.url

dataservice.harvest.last_harvested_at = datetime.utcnow()

return dataservice
81 changes: 3 additions & 78 deletions udata/core/dataset/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@

from udata import i18n, uris
from udata.core.spatial.models import SpatialCoverage
from udata.frontend.markdown import parse_html
from udata.core.dataset.models import HarvestDatasetMetadata, HarvestResourceMetadata
from udata.models import db, ContactPoint
from udata.rdf import (
DCAT, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS,
namespace_manager, schema_from_rdf, url_from_rdf
DCAT, DCT, FREQ, SCV, SKOS, SPDX, SCHEMA, EUFREQ, EUFORMAT, IANAFORMAT, VCARD, RDFS, contact_point_from_rdf,
namespace_manager, rdf_value, sanitize_html, schema_from_rdf, theme_labels_from_rdf, themes_from_rdf, url_from_rdf
)
from udata.utils import get_by, safe_unicode
from udata.uris import endpoint_for
Expand Down Expand Up @@ -77,32 +76,6 @@
}


class HTMLDetector(HTMLParser):
    '''
    HTML parser collecting the name of every tag it meets.

    Both opening and closing tags are recorded in the `elements` set.
    '''

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Names of all the tags (opening or closing) seen so far.
        self.elements = set()

    def handle_starttag(self, tag, attrs):
        self.elements.update((tag,))

    def handle_endtag(self, tag):
        self.elements.update((tag,))


def is_html(text):
    '''
    Tell whether `text` contains at least one HTML tag.

    The text is fed to an `HTMLDetector`; the result is true as soon as
    the detector recorded an opening or a closing tag.
    '''
    detector = HTMLDetector()
    detector.feed(text)
    return len(detector.elements) > 0


def sanitize_html(text):
    '''
    Turn an RDF literal into a clean string.

    Anything that is not a `Literal` is treated as an empty string.
    Content detected as HTML goes through `parse_html`, any other content
    is returned stripped of its surrounding whitespace.
    '''
    if isinstance(text, Literal):
        content = text.toPython()
    else:
        content = ''

    if not is_html(content):
        return content.strip()
    return parse_html(content)


def temporal_to_rdf(daterange, graph=None):
if not daterange:
return
Expand Down Expand Up @@ -231,18 +204,6 @@ def dataset_to_rdf(dataset, graph=None):
}


def serialize_value(value):
    '''
    Convert an RDF term into its Python counterpart.

    `URIRef` and `Literal` are converted with `toPython()`, an
    `RdfResource` through its identifier. Any other value gives `None`.
    '''
    if isinstance(value, (URIRef, Literal)):
        return value.toPython()
    if isinstance(value, RdfResource):
        identifier = value.identifier
        return identifier.toPython()
    return None


def rdf_value(obj, predicate, default=None):
    '''
    Fetch `predicate` on `obj` and return it as a Python value.

    `default` is returned when `obj.value(predicate)` is falsy, otherwise
    the value goes through `serialize_value`.
    '''
    found = obj.value(predicate)
    if not found:
        return default
    return serialize_value(found)


def temporal_from_literal(text):
'''
Parse a temporal coverage from a literal ie. either:
Expand Down Expand Up @@ -317,29 +278,6 @@ def temporal_from_rdf(period_of_time):
# so we log the error for future investigation and improvement
log.warning('Unable to parse temporal coverage', exc_info=True)


def contact_point_from_rdf(rdf, dataset):
    '''
    Find or create the `ContactPoint` described by `dcat:contactPoint`.

    `dataset` is the object whose `organization` (checked first) or
    `owner` the contact point is attached to.

    Returns `None` when the node has no contact point, when the contact
    point has no email, or when `dataset` has neither an organization nor
    an owner. Otherwise returns the matching stored contact point, or a
    newly saved one when none matches.
    '''
    node = rdf.value(DCAT.contactPoint)
    if not node:
        return None

    name = rdf_value(node, VCARD.fn) or ''
    # The email can be carried by several predicates: the first one found wins.
    email = (rdf_value(node, VCARD.hasEmail)
             or rdf_value(node, VCARD.email)
             or rdf_value(node, DCAT.email))
    if not email:
        return None
    email = email.replace('mailto:', '').strip()

    # The same keyword is used to look up and to create the contact point.
    if dataset.organization:
        holder = {'organization': dataset.organization}
    elif dataset.owner:
        holder = {'owner': dataset.owner}
    else:
        return None

    existing = ContactPoint.objects(name=name, email=email, **holder).first()
    return existing or ContactPoint(name=name, email=email, **holder).save()


def spatial_from_rdf(graph):
geojsons = []
for term in graph.objects(DCT.spatial):
Expand Down Expand Up @@ -467,17 +405,6 @@ def remote_url_from_rdf(rdf):
except uris.ValidationError:
pass


def theme_labels_from_rdf(rdf):
    '''
    Yield a label for each `dcat:theme` of `rdf`.

    A theme given as an `RdfResource` is labelled with its
    `skos:prefLabel`, any other theme with its `toPython()` value.
    Themes without a label are skipped.
    '''
    for theme in rdf.objects(DCAT.theme):
        label = (
            rdf_value(theme, SKOS.prefLabel)
            if isinstance(theme, RdfResource)
            else theme.toPython()
        )
        if not label:
            continue
        yield label


def resource_from_rdf(graph_or_distrib, dataset=None, is_additionnal=False):
'''
Map a Resource domain model to a DCAT/RDF graph
Expand Down Expand Up @@ -572,9 +499,7 @@ def dataset_from_rdf(graph: Graph, dataset=None, node=None):
if acronym:
dataset.acronym = acronym

tags = [tag.toPython() for tag in d.objects(DCAT.keyword)]
tags += theme_labels_from_rdf(d)
dataset.tags = list(set(tags))
dataset.tags = themes_from_rdf(d)

temporal_coverage = temporal_from_rdf(d.value(DCT.temporal))
if temporal_coverage:
Expand Down
73 changes: 73 additions & 0 deletions udata/rdf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
'''
This module centralizes udata-wide RDF helpers and configuration
'''
from html.parser import HTMLParser
import logging
import re

Expand All @@ -13,8 +14,10 @@
)
from rdflib.util import SUFFIX_FORMAT_MAP, guess_format as raw_guess_format
from udata import uris
from udata.core.contact_point.models import ContactPoint
from udata.models import Schema
from udata.mongo.errors import FieldValidationError
from udata.frontend.markdown import parse_html

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -212,6 +215,42 @@ def want_rdf():
'totalItems': 'hydra:totalItems',
}

def serialize_value(value):
    '''
    Convert an RDF term into its Python counterpart.

    `URIRef` and `Literal` are converted with `toPython()`, an
    `RdfResource` through its identifier. Any other value gives `None`.
    '''
    if isinstance(value, (URIRef, Literal)):
        return value.toPython()
    if isinstance(value, RdfResource):
        identifier = value.identifier
        return identifier.toPython()
    return None


def rdf_value(obj, predicate, default=None):
    '''
    Fetch `predicate` on `obj` and return it as a Python value.

    `default` is returned when `obj.value(predicate)` is falsy, otherwise
    the value goes through `serialize_value`.
    '''
    found = obj.value(predicate)
    if not found:
        return default
    return serialize_value(found)

class HTMLDetector(HTMLParser):
    '''
    HTML parser collecting the name of every tag it meets.

    Both opening and closing tags are recorded in the `elements` set.
    '''

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Names of all the tags (opening or closing) seen so far.
        self.elements = set()

    def handle_starttag(self, tag, attrs):
        self.elements.update((tag,))

    def handle_endtag(self, tag):
        self.elements.update((tag,))


def is_html(text):
    '''
    Tell whether `text` contains at least one HTML tag.

    The text is fed to an `HTMLDetector`; the result is true as soon as
    the detector recorded an opening or a closing tag.
    '''
    detector = HTMLDetector()
    detector.feed(text)
    return len(detector.elements) > 0


def sanitize_html(text):
    '''
    Turn an RDF literal into a clean string.

    Anything that is not a `Literal` is treated as an empty string.
    Content detected as HTML goes through `parse_html`, any other content
    is returned stripped of its surrounding whitespace.
    '''
    if isinstance(text, Literal):
        content = text.toPython()
    else:
        content = ''

    if not is_html(content):
        return content.strip()
    return parse_html(content)


def url_from_rdf(rdf, prop):
'''
Expand All @@ -224,6 +263,40 @@ def url_from_rdf(rdf, prop):
elif isinstance(value, RdfResource):
return value.identifier.toPython()

def theme_labels_from_rdf(rdf):
    '''
    Yield a label for each `dcat:theme` of `rdf`.

    A theme given as an `RdfResource` is labelled with its
    `skos:prefLabel`, any other theme with its `toPython()` value.
    Themes without a label are skipped.
    '''
    for theme in rdf.objects(DCAT.theme):
        label = (
            rdf_value(theme, SKOS.prefLabel)
            if isinstance(theme, RdfResource)
            else theme.toPython()
        )
        if not label:
            continue
        yield label

def themes_from_rdf(rdf):
    '''
    Build the list of tags of `rdf`.

    The tags are the `dcat:keyword` values together with the labels
    yielded by `theme_labels_from_rdf`, without duplicates. The order of
    the returned list is the iteration order of a set.
    '''
    unique_tags = set()
    unique_tags.update(
        [keyword.toPython() for keyword in rdf.objects(DCAT.keyword)]
    )
    unique_tags.update(theme_labels_from_rdf(rdf))
    return list(unique_tags)

def contact_point_from_rdf(rdf, dataset):
    '''
    Find or create the `ContactPoint` described by `dcat:contactPoint`.

    `dataset` is the object whose `organization` (checked first) or
    `owner` the contact point is attached to.

    Returns `None` when the node has no contact point, when the contact
    point has no email, or when `dataset` has neither an organization nor
    an owner. Otherwise returns the matching stored contact point, or a
    newly saved one when none matches.
    '''
    node = rdf.value(DCAT.contactPoint)
    if not node:
        return None

    name = rdf_value(node, VCARD.fn) or ''
    # The email can be carried by several predicates: the first one found wins.
    email = (rdf_value(node, VCARD.hasEmail)
             or rdf_value(node, VCARD.email)
             or rdf_value(node, DCAT.email))
    if not email:
        return None
    email = email.replace('mailto:', '').strip()

    # The same keyword is used to look up and to create the contact point.
    if dataset.organization:
        holder = {'organization': dataset.organization}
    elif dataset.owner:
        holder = {'owner': dataset.owner}
    else:
        return None

    existing = ContactPoint.objects(name=name, email=email, **holder).first()
    return existing or ContactPoint(name=name, email=email, **holder).save()

def schema_from_rdf(rdf):
'''
Expand Down

0 comments on commit cece558

Please sign in to comment.