Skip to content

Commit

Permalink
Merge branch 'main' into add/docs
Browse files Browse the repository at this point in the history
  • Loading branch information
PGijsbers committed Oct 18, 2023
2 parents bff897c + 8219ec8 commit b2af709
Show file tree
Hide file tree
Showing 12 changed files with 323 additions and 203 deletions.
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.5.0
hooks:
- id: check-ast
- id: check-toml
Expand All @@ -25,20 +25,20 @@ repos:


- repo: https://github.com/pre-commit/mirrors-mypy
rev: 'v1.2.0'
rev: 'v1.6.0'
hooks:
- id: mypy
additional_dependencies:
- fastapi
- pytest

- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.0.265
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.292
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]

- repo: https://github.com/psf/black
rev: 23.3.0
rev: 23.9.1
hooks:
- id: black
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ authors = [
]
description = "The Python-based REST API for OpenML."
readme = "README.md"
requires-python = ">=3.11"
requires-python = ">=3.12"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
Expand All @@ -17,6 +17,7 @@ dependencies = [
"uvicorn",
"sqlalchemy",
"mysqlclient",
"python_dotenv",
]

[project.optional-dependencies]
Expand Down
45 changes: 45 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import functools
import os
import tomllib
import typing
from pathlib import Path

from dotenv import load_dotenv

TomlTable = dict[str, typing.Any]


def _apply_defaults_to_siblings(configuration: TomlTable) -> TomlTable:
defaults = configuration["defaults"]
return {
subtable: (defaults | overrides) if isinstance(overrides, dict) else overrides
for subtable, overrides in configuration.items()
if subtable != "defaults"
}


@functools.cache
def load_database_configuration(file: Path = Path(__file__).parent / "config.toml") -> TomlTable:
    """Read the database configuration from ``file`` and apply credential overrides.

    The TOML ``[databases.defaults]`` table is merged into each sibling
    database table.  Usernames and passwords are then taken from the
    ``OPENML_DATABASES_<NAME>_USERNAME`` / ``..._PASSWORD`` environment
    variables (a ``.env`` file is honored via ``load_dotenv``), falling back
    to the development defaults ``root`` / ``ok``.  The result is cached per
    ``file`` argument.
    """
    configuration = tomllib.loads(file.read_text())
    database_configuration = _apply_defaults_to_siblings(configuration["databases"])

    # Credentials never live in the TOML file; they come from the environment.
    load_dotenv()
    for name in ("openml", "expdb"):
        env_prefix = f"OPENML_DATABASES_{name.upper()}"
        credentials = database_configuration[name]
        credentials["username"] = os.environ.get(f"{env_prefix}_USERNAME", "root")
        credentials["password"] = os.environ.get(f"{env_prefix}_PASSWORD", "ok")
    return database_configuration
11 changes: 11 additions & 0 deletions src/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Database connection settings.  Keys under [databases.defaults] are merged
# into every other [databases.*] table at load time; a table's own keys take
# precedence over the defaults.  Usernames and passwords are intentionally
# absent here — they are supplied via environment variables.
[databases.defaults]
host="127.0.0.1"
port="3306"
# SQLAlchemy `dialect` and `driver`: https://docs.sqlalchemy.org/en/20/dialects/index.html
drivername="mysql"

# The experiment database.
[databases.expdb]
database="openml_expdb"

# The main OpenML database.
[databases.openml]
database="openml"
9 changes: 7 additions & 2 deletions src/database/datasets.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
""" Translation from https://github.com/openml/OpenML/blob/c19c9b99568c0fabb001e639ff6724b9a754bbc9/openml_OS/models/api/v1/Api_data.php#L707"""
from typing import Any

from config import load_database_configuration
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

from database.meta import get_column_names

_database_configuration = load_database_configuration()
expdb_url = URL.create(**_database_configuration["expdb"])
expdb = create_engine(
"mysql://root:[email protected]:3306/openml_expdb",
expdb_url,
echo=True,
pool_recycle=3600,
)
openml_url = URL.create(**_database_configuration["openml"])
openml = create_engine(
"mysql://root:[email protected]:3306/openml",
openml_url,
echo=True,
pool_recycle=3600,
)
Expand Down
11 changes: 5 additions & 6 deletions src/database/users.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from pydantic import ConstrainedStr
from typing import Annotated

from pydantic import StringConstraints
from sqlalchemy import create_engine, text

from database.meta import get_column_names
Expand All @@ -9,11 +11,8 @@
pool_recycle=3600,
)


class APIKey(ConstrainedStr):
"""Enforces str is 32 hexadecimal characters, does not check validity."""

regex = r"^[0-9a-fA-F]{32}$"
# Enforces str is 32 hexadecimal characters, does not check validity.
APIKey = Annotated[str, StringConstraints(pattern=r"^[0-9a-fA-F]{32}$")]


def get_user_id_for(*, api_key: APIKey) -> int | None:
Expand Down
10 changes: 5 additions & 5 deletions src/routers/old/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,16 @@ def get_dataset_wrapped(
api_key: APIKey | None = None,
) -> dict[str, dict[str, Any]]:
try:
dataset = get_dataset(dataset_id, api_key).dict(by_alias=True)
dataset = get_dataset(dataset_id, api_key).model_dump(by_alias=True)
except HTTPException as e:
raise HTTPException(
status_code=http.client.PRECONDITION_FAILED,
detail=e.detail,
) from None
if dataset.get("processing_date"):
dataset["processing_date"] = str(dataset["processing_date"]).replace("T", " ")
if dataset.get("parquet_url"):
dataset["parquet_url"] = dataset["parquet_url"].replace("https", "http")
if processing_data := dataset.get("processing_date"):
dataset["processing_date"] = str(processing_data).replace("T", " ")
if parquet_url := dataset.get("parquet_url"):
dataset["parquet_url"] = str(parquet_url).replace("https", "http")

manual = []
# ref test.openml.org/d/33 (contributor) and d/34 (creator)
Expand Down
112 changes: 55 additions & 57 deletions src/schemas/datasets/dcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from abc import ABC
from typing import Union

from pydantic import BaseModel, Extra, Field
from pydantic import BaseModel, Field


class DcatAPContext(BaseModel):
Expand All @@ -28,143 +28,143 @@ class DcatAPContext(BaseModel):
class DcatAPObject(BaseModel, ABC):
"""Base class for all DCAT-AP objects"""

id_: str = Field(alias="@id")
id_: str = Field(serialization_alias="@id")

class Config:
extra = Extra.forbid
allow_population_by_field_name = True
model_config = {"populate_by_name": True, "extra": "forbid"}


class DcatAPIdentifier(DcatAPObject):
"""Identifying another DcatAPObject. Contains only an id."""


class VCardIndividual(DcatAPObject):
type_: str = Field(default="vcard:Individual", alias="@type", const=True)
type_: str = Field(default="vcard:Individual", serialization_alias="@type", const=True)
fn: str = Field(
alias="vcard:fn",
serialization_alias="vcard:fn",
description="The formatted text corresponding to the name of the object",
)


class VCardOrganisation(DcatAPObject):
type_: str = Field(default="vcard:Organisation", alias="@type", const=True)
type_: str = Field(default="vcard:Organisation", serialization_alias="@type", const=True)
fn: str = Field(
alias="vcard:fn",
serialization_alias="vcard:fn",
description="The formatted text corresponding to the name of the object",
)


class DcatLocation(DcatAPObject):
type_: str = Field(default="dct:Location", alias="@type", const=True)
bounding_box: str | None = Field(alias="dcat:bbox", default=None)
centroid: str | None = Field(alias="dcat:centroid", default=None)
geometry: str | None = Field(alias="dcat:geometry", default=None)
type_: str = Field(default="dct:Location", serialization_alias="@type", const=True)
bounding_box: str | None = Field(serialization_alias="dcat:bbox", default=None)
centroid: str | None = Field(serialization_alias="dcat:centroid", default=None)
geometry: str | None = Field(serialization_alias="dcat:geometry", default=None)


class SpdxChecksum(DcatAPObject):
type_: str = Field(default="spdx:Checksum", alias="@type", const=True)
algorithm: str = Field(alias="spdx:algorithm")
value: str = Field(alias="spdx:checksumValue")
type_: str = Field(default="spdx:Checksum", serialization_alias="@type", const=True)
algorithm: str = Field(serialization_alias="spdx:algorithm")
value: str = Field(serialization_alias="spdx:checksumValue")


class XSDDateTime(BaseModel):
type_: str = Field(default="xsd:dateTime", alias="@type", const=True)
value_: datetime.datetime | datetime.date = Field(alias="@value")
type_: str = Field(default="xsd:dateTime", serialization_alias="@type", const=True)
value_: datetime.datetime | datetime.date = Field(serialization_alias="@value")

class Config:
extra = Extra.forbid
allow_population_by_field_name = True
model_config = {"populate_by_name": True, "extra": "forbid"}


class DctPeriodOfTime(DcatAPObject):
type_: str = Field(default="dct:PeriodOfTime", alias="@type", const=True)
start_date: XSDDateTime | None = Field(alias="dcat:startDate", default=None)
end_date: XSDDateTime | None = Field(alias="dcat:endDate", default=None)
type_: str = Field(default="dct:PeriodOfTime", serialization_alias="@type", const=True)
start_date: XSDDateTime | None = Field(serialization_alias="dcat:startDate", default=None)
end_date: XSDDateTime | None = Field(serialization_alias="dcat:endDate", default=None)


class DcatAPDistribution(DcatAPObject):
type_: str = Field(default="dcat:Distribution", alias="@type", const=True)
type_: str = Field(default="dcat:Distribution", serialization_alias="@type", const=True)
access_url: list[str] = Field(
alias="dcat:accessURL",
serialization_alias="dcat:accessURL",
default_factory=list,
min_items=1,
min_length=1,
)
byte_size: int | None = Field(alias="dcat:byteSize", default=None)
checksum: DcatAPIdentifier | None = Field(alias="spdx:checksum", default=None)
description: list[str] = Field(alias="dct:description", default_factory=list)
download_url: list[str] = Field(alias="dcat:downloadURL", default_factory=list)
format_: str | None = Field(alias="dct:format", default=None)
license_: str | None = Field(alias="dct:license", default=None)
title: list[str] = Field(alias="dct:title", default_factory=list)
byte_size: int | None = Field(serialization_alias="dcat:byteSize", default=None)
checksum: DcatAPIdentifier | None = Field(serialization_alias="spdx:checksum", default=None)
description: list[str] = Field(serialization_alias="dct:description", default_factory=list)
download_url: list[str] = Field(serialization_alias="dcat:downloadURL", default_factory=list)
format_: str | None = Field(serialization_alias="dct:format", default=None)
license_: str | None = Field(serialization_alias="dct:license", default=None)
title: list[str] = Field(serialization_alias="dct:title", default_factory=list)


class DcatAPDataset(DcatAPObject):
type_: str = Field(default="dcat:Dataset", alias="@type", const=True)
type_: str = Field(default="dcat:Dataset", serialization_alias="@type", const=True)
description: list[str] = Field(
alias="dct:description",
serialization_alias="dct:description",
description="A free-text account of the Dataset",
default_factory=list,
min_items=1,
min_length=1,
)
title: list[str] = Field(
alias="dct:title",
serialization_alias="dct:title",
description="The name given to the Dataset",
default_factory=list,
min_items=1,
min_length=1,
)
contact_point: list[DcatAPIdentifier] = Field(
alias="dcat:contactPoint",
serialization_alias="dcat:contactPoint",
description="Contact information to send comments about the Dataset to.",
default_factory=list,
)
distribution: list[DcatAPIdentifier] = Field(
alias="dcat:distribution",
serialization_alias="dcat:distribution",
default_factory=list,
)
keyword: list[str] = Field(alias="dcat:keyword", default_factory=list)
keyword: list[str] = Field(serialization_alias="dcat:keyword", default_factory=list)
publisher: DcatAPIdentifier | None = Field(
alias="dct:publisher",
serialization_alias="dct:publisher",
description="The entity (organisation) responsible for making the Dataset available.",
default=None,
)
temporal_coverage: list[DcatAPIdentifier] = Field(
alias="dct:temporal",
serialization_alias="dct:temporal",
description="The temporal period that the Dataset covers.",
default_factory=list,
)
spatial_coverage: list[DcatAPIdentifier] = Field(
alias="dct:spatial",
serialization_alias="dct:spatial",
description="The geographic region that is covered by the Dataset.",
default_factory=list,
)
theme: list[str] = Field(
alias="dcat:theme",
serialization_alias="dcat:theme",
description="Any categories that may be associated with the Dataset.",
default_factory=list,
)

creator: list[DcatAPIdentifier] = Field(alias="dcat:creator", default_factory=list)
documentation: list[str] = Field(alias="foaf:page", default_factory=list)
creator: list[DcatAPIdentifier] = Field(
serialization_alias="dcat:creator",
default_factory=list,
)
documentation: list[str] = Field(serialization_alias="foaf:page", default_factory=list)
landing_page: list[str] = Field(
alias="dcat:landingPage",
serialization_alias="dcat:landingPage",
description="The web page that provides access to "
"the Dataset, its Distributions and/or additional information. "
"It is intended to point to a landing page at the original data "
"provider, not to a page on a site of a third party, "
"such as an aggregator.",
default_factory=list,
)
release_date: XSDDateTime | None = Field(alias="dct:issued")
update_date: XSDDateTime | None = Field(alias="dct:modified")
version: str | None = Field(alias="owl:versionInfo")
release_date: XSDDateTime | None = Field(serialization_alias="dct:issued", default=None)
update_date: XSDDateTime | None = Field(serialization_alias="dct:modified", default=None)
version: str | None = Field(serialization_alias="owl:versionInfo", default=None)


class DcatApWrapper(BaseModel):
"""The resulting class, containing a dataset and related entities in the graph"""

context_: DcatAPContext = Field(
default=DcatAPContext(),
alias="@context",
serialization_alias="@context",
const=True,
)
# instead of list[DcatAPObject], a union with all the possible values is necessary.
Expand All @@ -179,8 +179,6 @@ class DcatApWrapper(BaseModel):
VCardIndividual,
DctPeriodOfTime,
]
] = Field(alias="@graph")
] = Field(serialization_alias="@graph")

class Config:
extra = Extra.forbid
allow_population_by_field_name = True
model_config = {"populate_by_name": True, "extra": "forbid"}
Loading

0 comments on commit b2af709

Please sign in to comment.