Add/study (#131)

* Add boilerplate for `studies` endpoints * Support modern-style task studies * Add run study support, but it is untested. It is untested because the test database currently does not have any runs or run studies. * Add get study by alias * Move SQL queries to database submodule * Add migration test * Add information on the lack of support for legacy studies * Flows and setups are now also always returned
openml · Dec 15, 2023 · 486173e · 486173e
1 parent 5e95710
commit 486173e
Show file tree

Hide file tree

Showing 9 changed files with 698 additions and 2 deletions.
diff --git a/docs/migration.md b/docs/migration.md
@@ -91,6 +91,45 @@ includes datasets which are private.
 The `limit` and `offset` parameters can now be used independently, you no longer need
 to provide both if you wish to set only one.
 
+## Studies
+
+### `GET /{id_or_alias}`
+
+Old-style "legacy" studies which are solely based on tags are no longer supported.
+
+??? info "Affected Legacy Studies"
+
+ Only 25 old studies were affected by this change, listed below.
+ There is currently no migration plan for these studies.
+
+ | id | name|
+ | --: | :-- |
+ |1 |A large-scale comparison of classification algorit...|
+ |2 |Fast Algorithm Selection using Learning Curves|
+ |3 |Multi-Task Learning with a Natural Metric for Quan...|
+ |5 |Local and Global Feature Selection on Multilabel T...|
+ |7 |Massive machine learning experiments using mlr and...|
+ |8 |Decision tree comparaison|
+ |10| Collaborative primer|
+ |11| Having a Blast: Meta-Learning and Heterogeneous En...|
+ |12| Subspace Clustering via Seeking Neighbors with Min...|
+ |13| Meta-QSAR: learning how to learn QSARs|
+ |17| Subgroup Discovery|
+ |20| Mythbusting data mining urban legends through larg...|
+ |22| Identifying critical paths in undergraduate progra...|
+ |24| OpenML R paper|
+ |25| Bernd Demo Study for Multiclass SVMs OML WS 2016|
+ |27| Compare three different SVM versions of R package ...|
+ |30| OpenML Paper Study|
+ |31| Iris Data set Study|
+ |32| Data Streams and more|
+ |34| Massively Collaborative Machine Learning|
+ |37| Speeding up Algorithm Selection via Meta-learning ...|
+ |38| Performance of new ctree implementations on classi...|
+ |41| ASLib OpenML Scenario|
+ |50| Hyper-parameter tuning of Decision Trees|
+ |51| ensemble on diabetes |
+
 ## Others
 
 ### `GET /estimationprocedure/list`

diff --git a/src/core/formatting.py b/src/core/formatting.py
@@ -7,9 +7,9 @@
 
 
 def _str_to_bool(string: str) -> bool:
+ """Parse a textual truth value, ignoring case.
+
+ Accepts "true", "1", "yes", "y" as True and "false", "0", "no", "n" as False.
+ Raises ValueError for any other input.
+ """
- if string.casefold() in ["true", "1", "yes"]:
+ if string.casefold() in ["true", "1", "yes", "y"]:
  return True
- if string.casefold() in ["false", "0", "no"]:
+ if string.casefold() in ["false", "0", "no", "n"]:
  return False
  msg = f"Could not parse {string=} as bool."
  raise ValueError(msg)

diff --git a/src/database/studies.py b/src/database/studies.py
@@ -0,0 +1,68 @@
+from typing import cast
+
+from schemas.study import StudyType
+from sqlalchemy import Connection, Row, text
+
+
+def get_study_by_id(study_id: int, connection: Connection) -> Row:
+ return connection.execute(
+ text(
+ """
+ SELECT *, main_entity_type as type_
+ FROM study
+ WHERE id = :study_id
+ """,
+ ),
+ parameters={"study_id": study_id},
+ ).fetchone()
+
+
+def get_study_by_alias(alias: str, connection: Connection) -> Row:
+ return connection.execute(
+ text(
+ """
+ SELECT *, main_entity_type as type_
+ FROM study
+ WHERE alias = :study_id
+ """,
+ ),
+ parameters={"study_id": alias},
+ ).fetchone()
+
+
+def get_study_data(study: Row, expdb: Connection) -> list[Row]:
+ if study.type_ == StudyType.TASK:
+ return cast(
+ list[Row],
+ expdb.execute(
+ text(
+ """
+ SELECT ts.task_id as task_id, ti.value as data_id
+ FROM task_study as ts LEFT JOIN task_inputs ti ON ts.task_id = ti.task_id
+ WHERE ts.study_id = :study_id AND ti.input = 'source_data'
+ """,
+ ),
+ parameters={"study_id": study.id},
+ ).fetchall(),
+ )
+ return cast(
+ list[Row],
+ expdb.execute(
+ text(
+ """
+ SELECT
+ rs.run_id as run_id,
+ run.task_id as task_id,
+ run.setup as setup_id,
+ ti.value as data_id,
+ setup.implementation_id as flow_id
+ FROM run_study as rs
+ JOIN run ON run.rid = rs.run_id
+ JOIN algorithm_setup as setup ON setup.sid = run.setup
+ JOIN task_inputs as ti ON ti.task_id = run.task_id
+ WHERE rs.study_id = :study_id AND ti.input = 'source_data'
+ """,
+ ),
+ parameters={"study_id": study.id},
+ ).fetchall(),
+ )
diff --git a/src/main.py b/src/main.py
@@ -8,6 +8,7 @@
 from routers.openml.evaluations import router as evaluationmeasures_router
 from routers.openml.flows import router as flows_router
 from routers.openml.qualities import router as qualities_router
+from routers.openml.study import router as study_router
 from routers.openml.tasks import router as task_router
 from routers.openml.tasktype import router as ttype_router
 
@@ -49,6 +50,7 @@ def create_api() -> FastAPI:
  app.include_router(estimationprocedure_router)
  app.include_router(task_router)
  app.include_router(flows_router)
+ app.include_router(study_router)
  return app
 
 

diff --git a/src/routers/openml/study.py b/src/routers/openml/study.py
@@ -0,0 +1,62 @@
+import http.client
+from typing import Annotated
+
+from core.formatting import _str_to_bool
+from database.studies import get_study_by_alias, get_study_by_id, get_study_data
+from database.users import User, UserGroup
+from fastapi import APIRouter, Depends, HTTPException
+from schemas.core import Visibility
+from schemas.study import Study, StudyType
+from sqlalchemy import Connection, Row
+
+from routers.dependencies import expdb_connection, fetch_user
+
+router = APIRouter(prefix="/studies", tags=["studies"])
+
+
+def _get_study_raise_otherwise(id_or_alias: int | str, user: User | None, expdb: Connection) -> Row:
+ if isinstance(id_or_alias, int) or id_or_alias.isdigit():
+ study = get_study_by_id(int(id_or_alias), expdb)
+ else:
+ study = get_study_by_alias(id_or_alias, expdb)
+
+ if study is None:
+ raise HTTPException(status_code=http.client.NOT_FOUND, detail="Study not found.")
+ if study.visibility == Visibility.PRIVATE:
+ if user is None:
+ raise HTTPException(status_code=http.client.UNAUTHORIZED, detail="Study is private.")
+ if study.creator != user.user_id and UserGroup.ADMIN not in user.groups:
+ raise HTTPException(status_code=http.client.FORBIDDEN, detail="Study is private.")
+ if _str_to_bool(study.legacy):
+ raise HTTPException(
+ status_code=http.client.GONE,
+ detail="Legacy studies are no longer supported",
+ )
+
+ return study
+
+
+@router.get("/{alias_or_id}")
+def get_study(
+ alias_or_id: int | str,
+ user: Annotated[User | None, Depends(fetch_user)] = None,
+ expdb: Annotated[Connection, Depends(expdb_connection)] = None,
+) -> Study:
+ study = _get_study_raise_otherwise(alias_or_id, user, expdb)
+ study_data = get_study_data(study, expdb)
+ return Study(
+ id_=study.id,
+ name=study.name,
+ alias=study.alias,
+ main_entity_type=study.type_,
+ description=study.description,
+ visibility=study.visibility,
+ status=study.status,
+ creation_date=study.creation_date,
+ creator=study.creator,
+ data_ids=[row.data_id for row in study_data],
+ task_ids=[row.task_id for row in study_data],
+ run_ids=[row.run_id for row in study_data] if study.type_ == StudyType.RUN else [],
+ flow_ids=[row.flow_id for row in study_data] if study.type_ == StudyType.RUN else [],
+ setup_ids=[row.setup_id for row in study_data] if study.type_ == StudyType.RUN else [],
+ )
diff --git a/src/schemas/core.py b/src/schemas/core.py
@@ -0,0 +1,6 @@
+from enum import StrEnum, auto
+
+
+class Visibility(StrEnum):
+ """Allowed visibility values."""
+
+ # With `StrEnum`, `auto()` yields the lower-cased member name as the value.
+ PUBLIC = auto()
+ PRIVATE = auto()
diff --git a/src/schemas/study.py b/src/schemas/study.py
@@ -0,0 +1,34 @@
+from datetime import datetime
+from enum import StrEnum, auto
+
+from pydantic import BaseModel, Field
+
+from schemas.core import Visibility
+
+
+class StudyType(StrEnum):
+ """Main entity type of a study."""
+
+ # With `StrEnum`, `auto()` yields the lower-cased member name as the value.
+ RUN = auto()
+ TASK = auto()
+
+
+class StudyStatus(StrEnum):
+ """Allowed values for the `status` field of a study."""
+
+ # With `StrEnum`, `auto()` yields the lower-cased member name as the value.
+ ACTIVE = auto()
+ DEACTIVATED = auto()
+ IN_PREPARATION = auto()
+
+
+class Study(BaseModel):
+ """Schema describing a single study and the ids of the entities linked to it."""
+
+ # Serialized under the key "id" (see `serialization_alias`).
+ id_: int = Field(serialization_alias="id")
+ name: str
+ # May be None: not every study has an alias.
+ alias: str | None
+ main_entity_type: StudyType
+ description: str
+ visibility: Visibility
+ status: StudyStatus
+ creation_date: datetime
+ # Integer identifier of the creator.
+ creator: int
+ # Ids of the entities that belong to the study.
+ task_ids: list[int]
+ run_ids: list[int]
+ data_ids: list[int]
+ setup_ids: list[int]
+ flow_ids: list[int]
diff --git a/tests/routers/openml/migration/studies_migration_test.py b/tests/routers/openml/migration/studies_migration_test.py
@@ -0,0 +1,36 @@
+import deepdiff
+import httpx
+import pytest
+from core.conversions import nested_num_to_str, nested_remove_nones
+from starlette.testclient import TestClient
+
+
+@pytest.mark.php()
+def test_get_study_equal(py_api: TestClient, php_api: httpx.Client) -> None:
+ new = py_api.get("/studies/1")
+ old = php_api.get("/study/1")
+ assert new.status_code == old.status_code
+
+ new = new.json()
+ # New implementation is typed
+ new = nested_num_to_str(new)
+ # New implementation has same fields even if empty
+ new = nested_remove_nones(new)
+ new["tasks"] = {"task_id": new.pop("task_ids")}
+ new["data"] = {"data_id": new.pop("data_ids")}
+ if runs := new.pop("run_ids", None):
+ new["runs"] = {"run_id": runs}
+ if flows := new.pop("flow_ids", None):
+ new["flows"] = {"flow_id": flows}
+ if setups := new.pop("setup_ids", None):
+ new["setup"] = {"setup_id": setups}
+
+ # New implementation is not nested
+ new = {"study": new}
+ difference = deepdiff.diff.DeepDiff(
+ new,
+ old.json(),
+ ignore_order=True,
+ ignore_numeric_type_changes=True,
+ )
+ assert not difference