Delay transforming priority_order into ndarray #969

Closed
.github/workflows/publish-python-package.yml (2 changes: 2 additions & 0 deletions)

@@ -7,6 +7,8 @@ name: Publish Python Package
 on:
   release:
     types: [created]
+    branches:
+      - 'release/*'

 jobs:
   deploy:
dataprofiler/labelers/base_model.py (6 changes: 3 additions & 3 deletions)

@@ -32,7 +32,7 @@ def __new__(
 class BaseModel(metaclass=abc.ABCMeta):
     """For labeling data."""

-    _BaseModel__subclasses: dict[str, type[BaseModel]] = {}
+    __subclasses: dict[str, type[BaseModel]] = {}
     __metaclass__ = abc.ABCMeta

     # boolean if the label mapping requires the mapping for index 0 reserved
@@ -90,7 +90,7 @@ def __eq__(self, other: object) -> bool:
     def _register_subclass(cls) -> None:
         """Register a subclass for the class factory."""
         if not inspect.isabstract(cls):
-            cls._BaseModel__subclasses[cls.__name__.lower()] = cls
+            cls.__subclasses[cls.__name__.lower()] = cls

     @property
     def label_mapping(self) -> dict[str, int]:
@@ -156,7 +156,7 @@ def get_class(cls, class_name: str) -> type[BaseModel] | None:
         from .column_name_model import ColumnNameModel  # NOQA
         from .regex_model import RegexModel  # NOQA

-        return cls._BaseModel__subclasses.get(class_name.lower(), None)
+        return cls.__subclasses.get(class_name.lower(), None)

     def get_parameters(self, param_list: list[str] | None = None) -> dict:
         """
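The rename is behavior-preserving because of Python's name mangling: inside a class body, any identifier with two leading underscores is compiled to `_<ClassName>__<name>`, so `__subclasses` and `_BaseModel__subclasses` resolve to the same attribute, and spelling out the mangled form by hand was redundant. A minimal standalone sketch of that equivalence (illustrative class names, not dataprofiler code):

```python
class Base:
    # Stored as `_Base__registry` after name mangling.
    __registry: dict[str, type] = {}

    @classmethod
    def register(cls) -> None:
        # `cls.__registry` compiles to `cls._Base__registry` because this
        # method is defined inside `Base`.
        cls.__registry[cls.__name__.lower()] = cls


class Child(Base):
    pass


Child.register()
assert Base._Base__registry == {"child": Child}  # same dict, mangled name
```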
dataprofiler/labelers/data_processing.py (58 changes: 31 additions & 27 deletions)

@@ -49,16 +49,14 @@ def __init__(self, **parameters: Any) -> None:
     def _register_subclass(cls) -> None:
         """Register a subclass for the class factory."""
         if not inspect.isabstract(cls):
-            cls._BaseDataProcessor__subclasses[  # type: ignore
-                cls.__name__.lower()
-            ] = cls
+            cls.__subclasses[cls.__name__.lower()] = cls

     @classmethod
-    def get_class(cls: type[Processor], class_name: str) -> type[Processor] | None:
+    def get_class(
+        cls: type[BaseDataProcessor], class_name: str
+    ) -> type[BaseDataProcessor] | None:
         """Get class of BaseDataProcessor object."""
-        return cls._BaseDataProcessor__subclasses.get(  # type: ignore
-            class_name.lower(), None
-        )
+        return cls.__subclasses.get(class_name.lower(), None)

     def __eq__(self, other: object) -> bool:
@@ -129,7 +127,7 @@ def set_params(self, **kwargs: Any) -> None:
             self._parameters[param] = kwargs[param]

     @abc.abstractmethod
-    def process(self, *args: Any) -> Any:
+    def process(self, *args: Any, **kwargs: Any) -> Any:
         """Process data."""
         raise NotImplementedError()
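This one-line widening is what enables the many `# type: ignore` deletions on the `def process(` overrides below: mypy skips override-compatibility checks when a base method is declared with exactly `*args: Any, **kwargs: Any`, so each concrete processor can declare its own named parameters. A minimal sketch of the pattern, with illustrative names:

```python
from __future__ import annotations

import abc
from typing import Any


class Processor(metaclass=abc.ABCMeta):
    @abc.abstractmethod
    def process(self, *args: Any, **kwargs: Any) -> Any:
        """Process data; concrete processors declare real parameters."""
        raise NotImplementedError()


class TruncatingProcessor(Processor):
    # Against a (*args)-only base, mypy flags this narrower override as
    # incompatible; against (*args, **kwargs) it is accepted unchanged.
    def process(self, data: list[str], max_len: int = 10) -> list[str]:
        return [sample[:max_len] for sample in data]
```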
@@ -169,13 +167,15 @@ def __init__(self, **parameters: Any) -> None:
         super().__init__(**parameters)

     @abc.abstractmethod
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
         label_mapping: dict[str, int] | None = None,
         batch_size: int = 32,
-    ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None]:
+    ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] | tuple[
+        np.ndarray, np.ndarray
+    ] | np.ndarray:
         """Preprocess data."""
         raise NotImplementedError()
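The broadened return annotation records that a preprocessor may either yield batches lazily or hand back the transformed arrays in one piece. A short sketch of the two shapes the union now admits (illustrative code, not dataprofiler's):

```python
from __future__ import annotations

from collections.abc import Generator

import numpy as np


def batched(data: np.ndarray, batch_size: int) -> Generator[np.ndarray, None, None]:
    # Lazy form: yield one batch at a time.
    for start in range(0, len(data), batch_size):
        yield data[start : start + batch_size]


def eager(data: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    # Eager form: return, e.g., (features, labels) all at once.
    return data, np.zeros(len(data))
```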
@@ -191,7 +191,7 @@ def __init__(self, **parameters: Any) -> None:
         super().__init__(**parameters)

     @abc.abstractmethod
-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -240,7 +240,7 @@ def help(cls) -> None:
         )
         print(help_str)

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -668,7 +668,7 @@ def gen_none() -> Generator[None, None, None]:
             if batch_data["samples"]:
                 yield batch_data

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -735,8 +735,8 @@ def process(  # type: ignore
             X_train = np.array(
                 [[sentence] for sentence in batch_data["samples"]], dtype=object
             )
-            if labels is not None:
-                num_classes = max(label_mapping.values()) + 1  # type: ignore
+            if labels is not None and label_mapping is not None:
+                num_classes = max(label_mapping.values()) + 1

                 Y_train = tf.keras.utils.to_categorical(
                     batch_data["labels"], num_classes
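Tightening the guard to `labels is not None and label_mapping is not None` is a type-narrowing fix rather than a behavior change: once both operands are checked, mypy narrows `label_mapping` from `dict[str, int] | None` to `dict[str, int]` inside the branch, so the ignore comment becomes unnecessary. A self-contained sketch of the same narrowing (illustrative labels):

```python
from __future__ import annotations


def num_classes_for(
    labels: list[int] | None, label_mapping: dict[str, int] | None
) -> int | None:
    # Checking both values narrows `label_mapping` to a plain dict here,
    # so calling .values() needs no `# type: ignore`.
    if labels is not None and label_mapping is not None:
        return max(label_mapping.values()) + 1
    return None


assert num_classes_for([0, 1], {"PAD": 0, "UNKNOWN": 1}) == 2
```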
@@ -836,7 +836,7 @@ def _validate_parameters(self, parameters: dict) -> None:
         if errors:
             raise ValueError("\n".join(errors))

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -1269,7 +1269,7 @@ def match_sentence_lengths(

         return results

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -1439,7 +1439,7 @@ def convert_to_unstructured_format(

         return text, entities

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         labels: np.ndarray | None = None,
@@ -1503,8 +1503,12 @@ def process(  # type: ignore
                     unstructured_label_set,
                 ) = self.convert_to_unstructured_format(batch_data, batch_labels)
                 unstructured_data[ind] = unstructured_text
-                if labels is not None:
-                    unstructured_labels[ind] = unstructured_label_set  # type: ignore
+                if (
+                    labels is not None
+                    and unstructured_labels is not None
+                    and unstructured_label_set is not None
+                ):
+                    unstructured_labels[ind] = unstructured_label_set

             if labels is not None:
                 np_unstruct_labels = np.array(unstructured_labels, dtype="object")
@@ -1800,7 +1804,7 @@ def convert_to_structured_analysis(

         return results

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -2022,7 +2026,7 @@ def split_prediction(results: dict) -> None:
             pred, axis=1, ord=1, keepdims=True
         )

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -2043,9 +2047,9 @@ def process(  # type: ignore
         elif aggregation_func == "random":
             num_labels = max(label_mapping.values()) + 1
             random_state: random.Random = self._parameters["random_state"]
-            priority_order = np.array(list(range(num_labels)))
-            random_state.shuffle(priority_order)  # type: ignore
-            self.priority_prediction(results, priority_order)
+            priority_order = list(range(num_labels))
+            random_state.shuffle(priority_order)
+            self.priority_prediction(results, np.array(priority_order))
         else:
             raise ValueError(
                 f"`{aggregation_func}` is not a valid aggregation function"
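This hunk is the change the PR title describes: `random.Random.shuffle` is documented and typed for mutable sequences such as lists, not ndarrays, so building the priority order as a list, shuffling it, and converting to an ndarray only at the call site removes the last `# type: ignore` without changing the result. A minimal sketch, assuming an illustrative five-label mapping:

```python
import random

import numpy as np

num_labels = 5  # stands in for max(label_mapping.values()) + 1
rng = random.Random(0)

# Shuffle the plain list (the type shuffle() is declared for), then
# convert once, exactly where the ndarray is needed.
priority_order = list(range(num_labels))
rng.shuffle(priority_order)
print(np.array(priority_order))  # e.g. [2 0 1 4 3]
```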
@@ -2160,7 +2164,7 @@ def _save_processor(self, dirpath: str) -> None:
         ) as fp:
             json.dump(params, fp)

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
@@ -2253,7 +2257,7 @@ def help(cls) -> None:
         )
         print(help_str)

-    def process(  # type: ignore
+    def process(
         self,
         data: np.ndarray,
         results: dict,
dataprofiler/profilers/graph_profiler.py (16 changes: 15 additions & 1 deletion)

@@ -1,6 +1,7 @@
 """Class and functions to calculate and profile properties of graph data."""
 from __future__ import annotations

+import importlib
 import pickle
 from collections import defaultdict
 from datetime import datetime
@@ -10,6 +11,7 @@
 import numpy as np
 import pandas as pd
 import scipy.stats as st
+from packaging import version

 from ..data_readers.graph_data import GraphData
 from . import utils
@@ -391,6 +393,11 @@ def _get_continuous_distribution(
             st.lognorm,
             st.gamma,
         ]
+
+        scipy_gte_1_11_0 = version.parse(
+            importlib.metadata.version("scipy")
+        ) >= version.parse("1.11.0")
+
         for attribute in attributes:
             if attribute in continuous_attributes:
                 data_as_list = self._attribute_data_as_list(graph, attribute)
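A standalone sketch of the version gate, for context. One caution worth hedging: `importlib.metadata` is a submodule, and a bare `import importlib` does not guarantee it has been loaded, so the explicit submodule import below is the reliable spelling:

```python
import importlib.metadata  # explicit submodule import

from packaging import version

# True when the installed scipy is at least 1.11.0.
scipy_gte_1_11_0 = version.parse(
    importlib.metadata.version("scipy")
) >= version.parse("1.11.0")
print(scipy_gte_1_11_0)
```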
@@ -401,7 +408,14 @@

                 for distribution in distribution_candidates:
                     # compute fit, mle, kolmogorov-smirnov test to test fit, and pdf
-                    fit = distribution.fit(df)
+
+                    # scipy 1.11.0 updated the way they handle
+                    # the loc parameter in fit() for lognorm
+                    if distribution == st.lognorm and scipy_gte_1_11_0:
+                        fit = distribution.fit(df, superfit=True)
+
+                    else:
+                        fit = distribution.fit(df)
                     mle = distribution.nnlf(fit, df)

                     if mle <= best_mle:
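For context, the surrounding loop performs maximum-likelihood model selection: each candidate distribution is fit to the data and the candidate with the lowest negative log-likelihood (scipy's `nnlf`) is kept. A self-contained sketch of that pattern on synthetic data:

```python
import numpy as np
import scipy.stats as st

rng = np.random.default_rng(0)
data = rng.lognormal(mean=0.0, sigma=0.5, size=500)

best_mle = np.inf
best_fit = None
for distribution in [st.norm, st.uniform, st.expon, st.lognorm, st.gamma]:
    fit = distribution.fit(data)        # MLE of shape, loc, and scale
    mle = distribution.nnlf(fit, data)  # negative log-likelihood of that fit
    if mle <= best_mle:
        best_mle = mle
        best_fit = (distribution.name, fit)

print(best_fit)  # lognorm should win on lognormal samples
```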