Skip to content

Commit

Permalink
Add/study (#131)
Browse files Browse the repository at this point in the history
* Add boilerplate for `studies` endpoints

* Support modern-style task studies

* Add run study support, but it is untested

It is untested because the test database currently does not have
any runs or run studies.

* Add get study by alias

* Move SQL queries to database submodule

* Add migration test

* Add information on the lack of support for legacy studies

* Flows and setups are now also always returned
  • Loading branch information
PGijsbers authored Dec 15, 2023
1 parent 5e95710 commit 486173e
Show file tree
Hide file tree
Showing 9 changed files with 698 additions and 2 deletions.
39 changes: 39 additions & 0 deletions docs/migration.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,45 @@ includes datasets which are private.
The `limit` and `offset` parameters can now be used independently; you no longer need
to provide both if you wish to set only one.

## Studies

### `GET /{id_or_alias}`

Old-style "legacy" studies which are solely based on tags are no longer supported.

??? info "Affected Legacy Studies"

Only 25 old studies were affected by this change, listed below.
There is currently no migration plan for these studies.

| id | name|
| --: | :-- |
|1 |A large-scale comparison of classification algorit...|
|2 |Fast Algorithm Selection using Learning Curves|
|3 |Multi-Task Learning with a Natural Metric for Quan...|
|5 |Local and Global Feature Selection on Multilabel T...|
|7 |Massive machine learning experiments using mlr and...|
|8 |Decision tree comparaison|
|10| Collaborative primer|
|11| Having a Blast: Meta-Learning and Heterogeneous En...|
|12| Subspace Clustering via Seeking Neighbors with Min...|
|13| Meta-QSAR: learning how to learn QSARs|
|17| Subgroup Discovery|
|20| Mythbusting data mining urban legends through larg...|
|22| Identifying critical paths in undergraduate progra...|
|24| OpenML R paper|
|25| Bernd Demo Study for Multiclass SVMs OML WS 2016|
|27| Compare three different SVM versions of R package ...|
|30| OpenML Paper Study|
|31| Iris Data set Study|
|32| Data Streams and more|
|34| Massively Collaborative Machine Learning|
|37| Speeding up Algorithm Selection via Meta-learning ...|
|38| Performance of new ctree implementations on classi...|
|41| ASLib OpenML Scenario|
|50| Hyper-parameter tuning of Decision Trees|
|51| ensemble on diabetes |

## Others

### `GET /estimationprocedure/list`
Expand Down
4 changes: 2 additions & 2 deletions src/core/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@


def _str_to_bool(string: str) -> bool:
if string.casefold() in ["true", "1", "yes"]:
if string.casefold() in ["true", "1", "yes", "y"]:
return True
if string.casefold() in ["false", "0", "no"]:
if string.casefold() in ["false", "0", "no", "n"]:
return False
msg = f"Could not parse {string=} as bool."
raise ValueError(msg)
Expand Down
68 changes: 68 additions & 0 deletions src/database/studies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from typing import cast

from schemas.study import StudyType
from sqlalchemy import Connection, Row, text


def get_study_by_id(study_id: int, connection: Connection) -> Row | None:
    """Fetch the study whose ``id`` column equals ``study_id``.

    Args:
        study_id: Identifier matched against ``study.id``.
        connection: Connection the query is executed on.

    Returns:
        The first matching row, containing every ``study`` column plus
        ``main_entity_type`` aliased as ``type_``, or ``None`` when no row
        matches (callers are expected to handle the ``None`` case).
    """
    query = text(
        """
        SELECT *, main_entity_type as type_
        FROM study
        WHERE id = :study_id
        """,
    )
    return connection.execute(query, parameters={"study_id": study_id}).fetchone()


def get_study_by_alias(alias: str, connection: Connection) -> Row | None:
    """Fetch the study whose ``alias`` column equals ``alias``.

    Args:
        alias: Value matched against ``study.alias``.
        connection: Connection the query is executed on.

    Returns:
        The first matching row, containing every ``study`` column plus
        ``main_entity_type`` aliased as ``type_``, or ``None`` when no row
        matches (callers are expected to handle the ``None`` case).
    """
    # The bind parameter is named after what it holds (an alias, not an id).
    query = text(
        """
        SELECT *, main_entity_type as type_
        FROM study
        WHERE alias = :alias
        """,
    )
    return connection.execute(query, parameters={"alias": alias}).fetchone()


def get_study_data(study: Row, expdb: Connection) -> list[Row]:
    """Fetch the entities that belong to ``study``.

    Args:
        study: Study row; its ``type_`` selects the query and its ``id`` is
            bound as ``:study_id``.
        expdb: Connection the query is executed on.

    Returns:
        For a task study, rows with ``task_id`` and ``data_id``.
        For any other study type, rows with ``run_id``, ``task_id``,
        ``setup_id``, ``data_id`` and ``flow_id``.
    """
    if study.type_ == StudyType.TASK:
        query = text(
            """
            SELECT ts.task_id as task_id, ti.value as data_id
            FROM task_study as ts LEFT JOIN task_inputs ti ON ts.task_id = ti.task_id
            WHERE ts.study_id = :study_id AND ti.input = 'source_data'
            """,
        )
    else:
        # Every non-task study is resolved through its runs.
        query = text(
            """
            SELECT
            rs.run_id as run_id,
            run.task_id as task_id,
            run.setup as setup_id,
            ti.value as data_id,
            setup.implementation_id as flow_id
            FROM run_study as rs
            JOIN run ON run.rid = rs.run_id
            JOIN algorithm_setup as setup ON setup.sid = run.setup
            JOIN task_inputs as ti ON ti.task_id = run.task_id
            WHERE rs.study_id = :study_id AND ti.input = 'source_data'
            """,
        )
    rows = expdb.execute(query, parameters={"study_id": study.id}).fetchall()
    return cast(list[Row], rows)
2 changes: 2 additions & 0 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from routers.openml.evaluations import router as evaluationmeasures_router
from routers.openml.flows import router as flows_router
from routers.openml.qualities import router as qualities_router
from routers.openml.study import router as study_router
from routers.openml.tasks import router as task_router
from routers.openml.tasktype import router as ttype_router

Expand Down Expand Up @@ -49,6 +50,7 @@ def create_api() -> FastAPI:
app.include_router(estimationprocedure_router)
app.include_router(task_router)
app.include_router(flows_router)
app.include_router(study_router)
return app


Expand Down
62 changes: 62 additions & 0 deletions src/routers/openml/study.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import http.client
from typing import Annotated

from core.formatting import _str_to_bool
from database.studies import get_study_by_alias, get_study_by_id, get_study_data
from database.users import User, UserGroup
from fastapi import APIRouter, Depends, HTTPException
from schemas.core import Visibility
from schemas.study import Study, StudyType
from sqlalchemy import Connection, Row

from routers.dependencies import expdb_connection, fetch_user

router = APIRouter(prefix="/studies", tags=["studies"])


def _get_study_raise_otherwise(id_or_alias: int | str, user: User | None, expdb: Connection) -> Row:
    """Return the requested study, or raise if it cannot be served to ``user``.

    Args:
        id_or_alias: Numeric study id (as ``int`` or decimal string) or a study alias.
        user: The requesting user, or ``None`` when the request is unauthenticated.
        expdb: Connection used to look up the study.

    Returns:
        The study row.

    Raises:
        HTTPException: 404 if no study matches; 401 if the study is private and
            no user is given; 403 if the study is private and the user is neither
            its creator nor an admin; 410 if the study is a legacy study.
    """
    # Use `str.isdecimal`, not `str.isdigit`: the latter also accepts characters
    # such as "²" that `int()` cannot parse, which would surface as an unhandled
    # ValueError instead of an alias lookup.
    if isinstance(id_or_alias, int) or id_or_alias.isdecimal():
        study = get_study_by_id(int(id_or_alias), expdb)
    else:
        study = get_study_by_alias(id_or_alias, expdb)

    if study is None:
        raise HTTPException(status_code=http.client.NOT_FOUND, detail="Study not found.")
    if study.visibility == Visibility.PRIVATE:
        if user is None:
            raise HTTPException(status_code=http.client.UNAUTHORIZED, detail="Study is private.")
        if study.creator != user.user_id and UserGroup.ADMIN not in user.groups:
            raise HTTPException(status_code=http.client.FORBIDDEN, detail="Study is private.")
    # Access is checked first, so the legacy status of a private study is not
    # revealed to users who may not see it.
    if _str_to_bool(study.legacy):
        raise HTTPException(
            status_code=http.client.GONE,
            detail="Legacy studies are no longer supported",
        )

    return study


@router.get("/{alias_or_id}")
def get_study(
    alias_or_id: int | str,
    user: Annotated[User | None, Depends(fetch_user)] = None,
    expdb: Annotated[Connection, Depends(expdb_connection)] = None,
) -> Study:
    """Return the study identified by ``alias_or_id`` together with its entities.

    The lookup and access checks are delegated to ``_get_study_raise_otherwise``,
    which raises an ``HTTPException`` when the study cannot be served.
    Task and data identifiers are always filled; run, flow and setup
    identifiers are only filled for run studies and are empty lists otherwise.
    """
    study = _get_study_raise_otherwise(alias_or_id, user, expdb)
    entities = get_study_data(study, expdb)
    is_run_study = study.type_ == StudyType.RUN

    data_ids = [entity.data_id for entity in entities]
    task_ids = [entity.task_id for entity in entities]
    # These columns are only read for run studies.
    run_ids = [entity.run_id for entity in entities] if is_run_study else []
    flow_ids = [entity.flow_id for entity in entities] if is_run_study else []
    setup_ids = [entity.setup_id for entity in entities] if is_run_study else []

    return Study(
        id_=study.id,
        name=study.name,
        alias=study.alias,
        main_entity_type=study.type_,
        description=study.description,
        visibility=study.visibility,
        status=study.status,
        creation_date=study.creation_date,
        creator=study.creator,
        data_ids=data_ids,
        task_ids=task_ids,
        run_ids=run_ids,
        flow_ids=flow_ids,
        setup_ids=setup_ids,
    )
6 changes: 6 additions & 0 deletions src/schemas/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from enum import StrEnum, auto


class Visibility(StrEnum):
PUBLIC = auto()
PRIVATE = auto()
34 changes: 34 additions & 0 deletions src/schemas/study.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from datetime import datetime
from enum import StrEnum, auto

from pydantic import BaseModel, Field

from schemas.core import Visibility


class StudyType(StrEnum):
RUN = auto()
TASK = auto()


class StudyStatus(StrEnum):
ACTIVE = auto()
DEACTIVATED = auto()
IN_PREPARATION = auto()


class Study(BaseModel):
    """Response model describing a study and the entities it contains."""

    # --- Study metadata ---
    # Named `id_` in Python; serialized under the key "id".
    id_: int = Field(serialization_alias="id")
    name: str
    # A study does not need to have an alias.
    alias: str | None
    main_entity_type: StudyType
    description: str
    visibility: Visibility
    status: StudyStatus
    creation_date: datetime
    creator: int

    # --- Identifiers of the entities in the study ---
    task_ids: list[int]
    run_ids: list[int]
    data_ids: list[int]
    setup_ids: list[int]
    flow_ids: list[int]
36 changes: 36 additions & 0 deletions tests/routers/openml/migration/studies_migration_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import deepdiff
import httpx
import pytest
from core.conversions import nested_num_to_str, nested_remove_nones
from starlette.testclient import TestClient


@pytest.mark.php()
def test_get_study_equal(py_api: TestClient, php_api: httpx.Client) -> None:
    """Check that study 1 is identical between the new API and the PHP API.

    The new response is first reshaped into the layout of the old response,
    then both are compared with DeepDiff.
    """
    py_response = py_api.get("/studies/1")
    php_response = php_api.get("/study/1")
    assert py_response.status_code == php_response.status_code

    # The new implementation is typed and keeps fields even when they are empty;
    # convert numbers to strings and drop the `None`s before comparing.
    study = nested_remove_nones(nested_num_to_str(py_response.json()))

    # The old implementation groups identifiers under a nested key.
    study["tasks"] = {"task_id": study.pop("task_ids")}
    study["data"] = {"data_id": study.pop("data_ids")}
    # NOTE(review): the group key for setups is "setup" (singular) while the others
    # are plural; this branch is not exercised by study 1. Confirm against the
    # PHP response of a run study.
    optional_groups = (
        ("run_ids", "runs", "run_id"),
        ("flow_ids", "flows", "flow_id"),
        ("setup_ids", "setup", "setup_id"),
    )
    for flat_key, group_key, item_key in optional_groups:
        if identifiers := study.pop(flat_key, None):
            study[group_key] = {item_key: identifiers}

    # The old implementation wraps everything in a top-level "study" object.
    expected_shape = {"study": study}
    difference = deepdiff.diff.DeepDiff(
        expected_shape,
        php_response.json(),
        ignore_order=True,
        ignore_numeric_type_changes=True,
    )
    assert not difference
Loading

0 comments on commit 486173e

Please sign in to comment.