feat: Restructure packages to streamline the addition of new standards
roquelopez committed Oct 22, 2024
1 parent c82c887 commit c951e99
Showing 10 changed files with 20,116 additions and 157,943 deletions.
bdikit/api.py — 55 changes: 27 additions & 28 deletions
@@ -1,31 +1,18 @@
from __future__ import annotations
import logging

from collections import defaultdict
from os.path import join, dirname
from typing import (
Union,
List,
Dict,
TypedDict,
Optional,
Tuple,
Callable,
Any,
)
import itertools
import pandas as pd
import numpy as np
import panel as pn
from IPython.display import display, Markdown
from bdikit.utils import get_gdc_data, get_gdc_metadata

from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.best.matcher_factory import SchemaMatchers
from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
from bdikit.schema_matching.topk.matcher_factory import TopkMatchers
from bdikit.value_matching.base import BaseValueMatcher, ValueMatch, ValueMatchingResult
from bdikit.value_matching.matcher_factory import ValueMatchers
from bdikit.standards.standard_factory import Standards

from bdikit.mapping_functions import (
ValueMapper,
@@ -34,11 +21,21 @@
IdentityValueMapper,
)

from typing import (
Union,
List,
Dict,
TypedDict,
Optional,
Tuple,
Callable,
Any,
)

from bdikit.config import DEFAULT_SCHEMA_MATCHING_METHOD, DEFAULT_VALUE_MATCHING_METHOD

pn.extension("tabulator")

GDC_DATA_PATH = join(dirname(__file__), "./resource/gdc_table.csv")
DEFAULT_VALUE_MATCHING_METHOD = "tfidf"
DEFAULT_SCHEMA_MATCHING_METHOD = "coma"
logger = logging.getLogger(__name__)


@@ -92,10 +89,10 @@ def _load_table_for_standard(name: str) -> pd.DataFrame:
Load the table for the given standard data vocabulary. Currently, only the
GDC standard is supported.
"""
if name == "gdc":
return pd.read_csv(GDC_DATA_PATH)
else:
raise ValueError(f"The {name} standard is not supported")
standard = Standards.get_instance(name)
df = standard.get_dataframe_rep()

return df


def top_matches(
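The rewritten _load_table_for_standard no longer hard-codes GDC: it asks the Standards factory for an instance and renders it as a DataFrame, so supporting a new vocabulary means registering a new standard class rather than editing the API layer. The sketch below only illustrates that registry pattern; it is not bdikit's actual standard_factory code (which this diff does not show), and the GDCStandard class and _REGISTRY dict are hypothetical names.

```python
from typing import Callable, Dict

import pandas as pd


class GDCStandard:
    """Hypothetical standard class; the real one would load the GDC vocabulary table."""

    def get_dataframe_rep(self) -> pd.DataFrame:
        # Placeholder frame; the real class would return the full vocabulary.
        return pd.DataFrame({"column_name": [], "column_description": []})


# Hypothetical registry: adding a standard would mean adding one entry here.
_REGISTRY: Dict[str, Callable[[], object]] = {
    "gdc": GDCStandard,
}


class Standards:
    @staticmethod
    def get_instance(name: str):
        if name not in _REGISTRY:
            raise ValueError(f"The {name} standard is not supported")
        return _REGISTRY[name]()
```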
@@ -439,9 +436,10 @@ def _format_value_matching_input(
f"The source column '{source_column}' is not present in the source dataset."
)

if isinstance(target, str) and target == "gdc":
if isinstance(target, str):
column_names = mapping_df["target"].unique().tolist()
target_domain = get_gdc_data(column_names)
standard = Standards.get_instance(target)
target_domain = standard.get_column_values(column_names)
elif isinstance(target, pd.DataFrame):
target_domain = {
column_name: target[column_name].unique().tolist()
@@ -518,11 +516,12 @@ def preview_domain(
(if applicable).
"""

if isinstance(dataset, str) and dataset == "gdc":
gdc_metadata = get_gdc_metadata()
value_names = gdc_metadata[column]["value_names"]
value_descriptions = gdc_metadata[column]["value_descriptions"]
column_description = gdc_metadata[column]["description"]
if isinstance(dataset, str):
standard = Standards.get_instance(dataset)
column_metadata = standard.get_column_metadata([column])
value_names = column_metadata[column]["value_names"]
value_descriptions = column_metadata[column]["value_descriptions"]
column_description = column_metadata[column]["description"]
assert len(value_names) == len(value_descriptions)
elif isinstance(dataset, pd.DataFrame):
value_names = dataset[column].unique()
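The preview_domain and _format_value_matching_input changes route column metadata and allowed values through the same factory. Below is a minimal usage sketch based only on the accessors visible in this diff; the column name "primary_diagnosis" is just an example, and the return shapes are assumed from how the code above consumes them, not verified against the library.

```python
from bdikit.standards.standard_factory import Standards

standard = Standards.get_instance("gdc")

# Per-column metadata, as consumed by preview_domain above.
meta = standard.get_column_metadata(["primary_diagnosis"])  # example column name
print(meta["primary_diagnosis"]["description"])
print(meta["primary_diagnosis"]["value_names"][:5])

# Allowed values per column, as consumed by _format_value_matching_input above.
values = standard.get_column_values(["primary_diagnosis"])
print(values["primary_diagnosis"][:5])
```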
bdikit/config.py — 2 changes: 2 additions & 0 deletions
@@ -6,6 +6,8 @@

BDIKIT_DEVICE: str = os.getenv("BDIKIT_DEVICE", default="cpu")
VALUE_MATCHING_THRESHOLD = 0.3
DEFAULT_VALUE_MATCHING_METHOD = "tfidf"
DEFAULT_SCHEMA_MATCHING_METHOD = "coma"


def get_device() -> str:
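With the two defaults promoted into bdikit/config.py, modules import them instead of redefining local copies, which is exactly what api.py does above. A small sketch of the intended pattern; the choose_value_matcher helper is hypothetical.

```python
from bdikit.config import (
    DEFAULT_SCHEMA_MATCHING_METHOD,  # "coma"
    DEFAULT_VALUE_MATCHING_METHOD,   # "tfidf"
)


# Hypothetical helper: pick up the shared default instead of hard-coding "tfidf".
def choose_value_matcher(method: str = DEFAULT_VALUE_MATCHING_METHOD) -> str:
    return method


print(choose_value_matcher())          # -> "tfidf"
print(DEFAULT_SCHEMA_MATCHING_METHOD)  # -> "coma"
```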
