Delay transforming priority_order into ndarray #969

Closed
.github/workflows/publish-python-package.yml (2 changes: 2 additions & 0 deletions)

@@ -7,6 +7,8 @@ name: Publish Python Package
 on:
   release:
     types: [created]
+    branches:
+      - 'release/*'

 jobs:
   deploy:
dataprofiler/labelers/base_model.py (6 changes: 3 additions & 3 deletions)

@@ -32,7 +32,7 @@ def __new__(
 class BaseModel(metaclass=abc.ABCMeta):
     """For labeling data."""

-    _BaseModel__subclasses: dict[str, type[BaseModel]] = {}
+    __subclasses: dict[str, type[BaseModel]] = {}
     __metaclass__ = abc.ABCMeta

     # boolean if the label mapping requires the mapping for index 0 reserved
@@ -90,7 +90,7 @@ def __eq__(self, other: object) -> bool:
     def _register_subclass(cls) -> None:
         """Register a subclass for the class factory."""
         if not inspect.isabstract(cls):
-            cls._BaseModel__subclasses[cls.__name__.lower()] = cls
+            cls.__subclasses[cls.__name__.lower()] = cls

     @property
     def label_mapping(self) -> dict[str, int]:
@@ -156,7 +156,7 @@ def get_class(cls, class_name: str) -> type[BaseModel] | None:
         from .column_name_model import ColumnNameModel  # NOQA
         from .regex_model import RegexModel  # NOQA

-        return cls._BaseModel__subclasses.get(class_name.lower(), None)
+        return cls.__subclasses.get(class_name.lower(), None)

     def get_parameters(self, param_list: list[str] | None = None) -> dict:
         """
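The rename is behavior-preserving because of Python's name mangling: inside a class body, any identifier with two leading underscores is compiled to `_<ClassName>__<name>`, so `__subclasses` and `_BaseModel__subclasses` resolve to the same attribute, and spelling out the mangled form by hand was redundant. A minimal standalone sketch of that equivalence (illustrative class names, not dataprofiler code):

```python
class Base:
    # Stored as `_Base__registry` after name mangling.
    __registry: dict[str, type] = {}

    @classmethod
    def register(cls) -> None:
        # `cls.__registry` compiles to `cls._Base__registry` because this
        # method is defined inside `Base`.
        cls.__registry[cls.__name__.lower()] = cls


class Child(Base):
    pass


Child.register()
assert Base._Base__registry == {"child": Child}  # same dict, mangled name
```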
dataprofiler/labelers/data_processing.py (58 changes: 31 additions & 27 deletions)

@@ -49,16 +49,14 @@ def __init__(self, **parameters: Any) -> None:
     def _register_subclass(cls) -> None:
         """Register a subclass for the class factory."""
         if not inspect.isabstract(cls):
-            cls._BaseDataProcessor__subclasses[  # type: ignore
-                cls.__name__.lower()
-            ] = cls
+            cls.__subclasses[cls.__name__.lower()] = cls

     @classmethod
-    def get_class(cls: type[Processor], class_name: str) -> type[Processor] | None:
+    def get_class(
+        cls: type[BaseDataProcessor], class_name: str
+    ) -> type[BaseDataProcessor] | None:
         """Get class of BaseDataProcessor object."""
-        return cls._BaseDataProcessor__subclasses.get(  # type: ignore
-            class_name.lower(), None
-        )
+        return cls.__subclasses.get(class_name.lower(), None)

     def __eq__(self, other: object) -> bool:
@@ -129,7 +127,7 @@ def set_params(self, **kwargs: Any) -> None:
             self._parameters[param] = kwargs[param]

     @abc.abstractmethod
-    def process(self, *args: Any) -> Any:
+    def process(self, *args: Any, **kwargs: Any) -> Any:
         """Process data."""
         raise NotImplementedError()
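This one-line widening is what enables the many `# type: ignore` deletions on the `def process(` overrides below: mypy skips override-compatibility checks when a base method is declared with exactly `*args: Any, **kwargs: Any`, so each concrete processor can declare its own named parameters. A minimal sketch of the pattern, with illustrative names:

```python
from __future__ import annotations

import abc
from typing import Any


class Processor(metaclass=abc.ABCMeta):
    @abc.abstractmethod
    def process(self, *args: Any, **kwargs: Any) -> Any:
        """Process data; concrete processors declare real parameters."""
        raise NotImplementedError()


class TruncatingProcessor(Processor):
    # Against a (*args)-only base, mypy flags this narrower override as
    # incompatible; against (*args, **kwargs) it is accepted unchanged.
    def process(self, data: list[str], max_len: int = 10) -> list[str]:
        return [sample[:max_len] for sample in data]
```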
@@ -169,13 +167,15 @@ def __init__(self, **parameters: Any) -> None:
         super().__init__(**parameters)

     @abc.abstractmethod
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
         label_mapping: dict[str, int] | None = None,
         batch_size: int = 32,
-    ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None]:
+    ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] | tuple[
+        np.ndarray, np.ndarray
+    ] | np.ndarray:
         """Preprocess data."""
         raise NotImplementedError()
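The broadened return annotation records that a preprocessor may either yield batches lazily or hand back the transformed arrays in one piece. A short sketch of the two shapes the union now admits (illustrative code, not dataprofiler's):

```python
from __future__ import annotations

from collections.abc import Generator

import numpy as np


def batched(data: np.ndarray, batch_size: int) -> Generator[np.ndarray, None, None]:
    # Lazy form: yield one batch at a time.
    for start in range(0, len(data), batch_size):
        yield data[start : start + batch_size]


def eager(data: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    # Eager form: return, e.g., (features, labels) all at once.
    return data, np.zeros(len(data))
```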
@@ -191,7 +191,7 @@ def __init__(self, **parameters: Any) -> None:
         super().__init__(**parameters)

     @abc.abstractmethod
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -240,7 +240,7 @@ def help(cls) -> None:
         )
         print(help_str)

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -668,7 +668,7 @@ def gen_none() -> Generator[None, None, None]:
             if batch_data["samples"]:
                 yield batch_data

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -735,8 +735,8 @@ def process(  # type: ignore
             X_train = np.array(
                 [[sentence] for sentence in batch_data["samples"]], dtype=object
             )
-            if labels is not None:
-                num_classes = max(label_mapping.values()) + 1  # type: ignore
+            if labels is not None and label_mapping is not None:
+                num_classes = max(label_mapping.values()) + 1

                 Y_train = tf.keras.utils.to_categorical(
                     batch_data["labels"], num_classes
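Tightening the guard to `labels is not None and label_mapping is not None` is a type-narrowing fix rather than a behavior change: once both operands are checked, mypy narrows `label_mapping` from `dict[str, int] | None` to `dict[str, int]` inside the branch, so the ignore comment becomes unnecessary. A self-contained sketch of the same narrowing (illustrative labels):

```python
from __future__ import annotations


def num_classes_for(
    labels: list[int] | None, label_mapping: dict[str, int] | None
) -> int | None:
    # Checking both values narrows `label_mapping` to a plain dict here,
    # so calling .values() needs no `# type: ignore`.
    if labels is not None and label_mapping is not None:
        return max(label_mapping.values()) + 1
    return None


assert num_classes_for([0, 1], {"PAD": 0, "UNKNOWN": 1}) == 2
```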
@@ -836,7 +836,7 @@ def _validate_parameters(self, parameters: dict) -> None:
         if errors:
             raise ValueError("\n".join(errors))

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -1269,7 +1269,7 @@ def match_sentence_lengths(

         return results

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -1439,7 +1439,7 @@ def convert_to_unstructured_format(

         return text, entities

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -1503,8 +1503,12 @@ def process(  # type: ignore
                     unstructured_label_set,
                 ) = self.convert_to_unstructured_format(batch_data, batch_labels)
                 unstructured_data[ind] = unstructured_text
-                if labels is not None:
-                    unstructured_labels[ind] = unstructured_label_set  # type: ignore
+                if (
+                    labels is not None
+                    and unstructured_labels is not None
+                    and unstructured_label_set is not None
+                ):
+                    unstructured_labels[ind] = unstructured_label_set

             if labels is not None:
                 np_unstruct_labels = np.array(unstructured_labels, dtype="object")
@@ -1800,7 +1804,7 @@ def convert_to_structured_analysis(

         return results

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -2022,7 +2026,7 @@ def split_prediction(results: dict) -> None:
             pred, axis=1, ord=1, keepdims=True
         )

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -2043,9 +2047,9 @@ def process(  # type: ignore
         elif aggregation_func == "random":
             num_labels = max(label_mapping.values()) + 1
             random_state: random.Random = self._parameters["random_state"]
-            priority_order = np.array(list(range(num_labels)))
-            random_state.shuffle(priority_order)  # type: ignore
-            self.priority_prediction(results, priority_order)
+            priority_order = list(range(num_labels))
+            random_state.shuffle(priority_order)
+            self.priority_prediction(results, np.array(priority_order))
         else:
             raise ValueError(
                 f"`{aggregation_func}` is not a valid aggregation function"
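This hunk is the change the PR title describes: `random.Random.shuffle` is documented and typed for mutable sequences such as lists, not ndarrays, so building the priority order as a list, shuffling it, and converting to an ndarray only at the call site removes the last `# type: ignore` without changing the result. A minimal sketch, assuming an illustrative five-label mapping:

```python
import random

import numpy as np

num_labels = 5  # stands in for max(label_mapping.values()) + 1
rng = random.Random(0)

# Shuffle the plain list (the type shuffle() is declared for), then
# convert once, exactly where the ndarray is needed.
priority_order = list(range(num_labels))
rng.shuffle(priority_order)
print(np.array(priority_order))  # e.g. [2 0 1 4 3]
```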
@@ -2160,7 +2164,7 @@ def _save_processor(self, dirpath: str) -> None:
         ) as fp:
             json.dump(params, fp)

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -2253,7 +2257,7 @@ def help(cls) -> None:
         )
         print(help_str)

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
dataprofiler/profilers/graph_profiler.py (16 changes: 15 additions & 1 deletion)

@@ -1,6 +1,7 @@
 """Class and functions to calculate and profile properties of graph data."""
 from __future__ import annotations

+import importlib
 import pickle
 from collections import defaultdict
 from datetime import datetime
@@ -10,6 +11,7 @@
 import numpy as np
 import pandas as pd
 import scipy.stats as st
+from packaging import version

 from ..data_readers.graph_data import GraphData
 from . import utils
@@ -391,6 +393,11 @@ def _get_continuous_distribution(
             st.lognorm,
             st.gamma,
         ]
+
+        scipy_gte_1_11_0 = version.parse(
+            importlib.metadata.version("scipy")
+        ) >= version.parse("1.11.0")
+
         for attribute in attributes:
             if attribute in continuous_attributes:
                 data_as_list = self._attribute_data_as_list(graph, attribute)
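A standalone sketch of the version gate, for context. One caution worth hedging: `importlib.metadata` is a submodule, and a bare `import importlib` does not guarantee it has been loaded, so the explicit submodule import below is the reliable spelling:

```python
import importlib.metadata  # explicit submodule import

from packaging import version

# True when the installed scipy is at least 1.11.0.
scipy_gte_1_11_0 = version.parse(
    importlib.metadata.version("scipy")
) >= version.parse("1.11.0")
print(scipy_gte_1_11_0)
```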
@@ -401,7 +408,14 @@

                 for distribution in distribution_candidates:
                     # compute fit, mle, kolmogorov-smirnov test to test fit, and pdf
-                    fit = distribution.fit(df)
+
+                    # scipy 1.11.0 updated the way they handle
+                    # the loc parameter in fit() for lognorm
+                    if distribution == st.lognorm and scipy_gte_1_11_0:
+                        fit = distribution.fit(df, superfit=True)
+
+                    else:
+                        fit = distribution.fit(df)
                     mle = distribution.nnlf(fit, df)

                     if mle <= best_mle:
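For context, the surrounding loop performs maximum-likelihood model selection: each candidate distribution is fit to the data and the candidate with the lowest negative log-likelihood (scipy's `nnlf`) is kept. A self-contained sketch of that pattern on synthetic data:

```python
import numpy as np
import scipy.stats as st

rng = np.random.default_rng(0)
data = rng.lognormal(mean=0.0, sigma=0.5, size=500)

best_mle = np.inf
best_fit = None
for distribution in [st.norm, st.uniform, st.expon, st.lognorm, st.gamma]:
    fit = distribution.fit(data)        # MLE of shape, loc, and scale
    mle = distribution.nnlf(fit, data)  # negative log-likelihood of that fit
    if mle <= best_mle:
        best_mle = mle
        best_fit = (distribution.name, fit)

print(best_fit)  # lognorm should win on lognormal samples
```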