From b63b25d1a3cd37c308bd1cbd93048294fcb2fe62 Mon Sep 17 00:00:00 2001 From: Liz Smith Date: Thu, 6 Jul 2023 13:54:59 -0500 Subject: [PATCH 1/8] let's try this again (#953) --- .../tests/profilers/test_profile_builder.py | 110 +++++++++++++----- 1 file changed, 82 insertions(+), 28 deletions(-) diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index 7a309e946..a4326492f 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -327,7 +327,12 @@ def test_correlation(self, *mock): # sum((x - np.mean(x))*(y-np.mean(y))) / # np.sqrt(sum((x - np.mean(x)**2)))/np.sqrt(sum((y - np.mean(y)**2))) profile_options = dp.ProfilerOptions() - profile_options.set({"correlation.is_enabled": True}) + profile_options.set( + { + "correlation.is_enabled": True, + "structured_options.multiprocess.is_enabled": False, + } + ) # data with a sole numeric column data = pd.DataFrame([1.0, 8.0, 1.0, -2.0, 5.0]) @@ -580,7 +585,12 @@ def test_merge_correlation(self, *mocks): def test_correlation_update(self): profile_options = dp.ProfilerOptions() - profile_options.set({"correlation.is_enabled": True}) + profile_options.set( + { + "correlation.is_enabled": True, + "structured_options.multiprocess.is_enabled": False, + } + ) # Test with all numeric columns data = pd.DataFrame( @@ -776,12 +786,14 @@ def test_correlation_selected_columns(self, *mocks): def test_chi2(self, *mocks): # Empty data = pd.DataFrame([]) - profiler = dp.StructuredProfiler(data) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + profiler = dp.StructuredProfiler(data, options=profile_options) self.assertIsNone(profiler.chi2_matrix) # Single column data = pd.DataFrame({"a": ["y", "y", "n", "n", "y"]}) - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) expected_mat = np.array([1]) self.assertEqual(expected_mat, profiler.chi2_matrix) @@ -793,7 +805,7 @@ def test_chi2(self, *mocks): } ) - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) expected_mat = np.array( [[1, 0.309924, 0.404638], [0.309924, 1, 0.548812], [0.404638, 0.548812, 1]] ) @@ -808,7 +820,7 @@ def test_chi2(self, *mocks): } ) - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) expected_mat = np.array( [[1, 0.007295, 0.007295], [0.007295, 1, 0.015609], [0.007295, 0.015609, 1]] ) @@ -823,7 +835,7 @@ def test_chi2(self, *mocks): } ) - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) expected_mat = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]) np.testing.assert_array_almost_equal(expected_mat, profiler.chi2_matrix) @@ -840,8 +852,10 @@ def test_merge_chi2(self, *mocks): "c": ["n", "maybe", "n", "n", "n", "y", "y"], } ) - profiler1 = dp.StructuredProfiler(None) - profiler2 = dp.StructuredProfiler(data) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + profiler1 = dp.StructuredProfiler(None, options=profile_options) + profiler2 = dp.StructuredProfiler(data, options=profile_options) with mock.patch( "dataprofiler.profilers.profile_builder." 
"StructuredProfiler._add_error_checks" @@ -862,8 +876,8 @@ def test_merge_chi2(self, *mocks): data1 = data[:4] data2 = data[4:] - profiler1 = dp.StructuredProfiler(data1) - profiler2 = dp.StructuredProfiler(data2) + profiler1 = dp.StructuredProfiler(data1, options=profile_options) + profiler2 = dp.StructuredProfiler(data2, options=profile_options) profiler3 = profiler1 + profiler2 expected_mat = np.array( [[1, 0.309924, 0.404638], [0.309924, 1, 0.548812], [0.404638, 0.548812, 1]] @@ -880,8 +894,8 @@ def test_merge_chi2(self, *mocks): ) data1 = data[:4] data2 = data[4:] - profiler1 = dp.StructuredProfiler(data1) - profiler2 = dp.StructuredProfiler(data2) + profiler1 = dp.StructuredProfiler(data1, options=profile_options) + profiler2 = dp.StructuredProfiler(data2, options=profile_options) profiler3 = profiler1 + profiler2 expected_mat = np.array( [[1, 0.007295, 0.007295], [0.007295, 1, 0.015609], [0.007295, 0.015609, 1]] @@ -918,7 +932,9 @@ def test_update_chi2(self, *mocks): } ) data2 = pd.DataFrame({"a": [], "b": [], "c": []}) - profiler = dp.StructuredProfiler(data1) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + profiler = dp.StructuredProfiler(data1, options=profile_options) profiler.update_profile(data2) expected_mat = np.array( [[1, 0.309924, 0.404638], [0.309924, 1, 0.548812], [0.404638, 0.548812, 1]] @@ -934,7 +950,7 @@ def test_update_chi2(self, *mocks): ) data1 = data[:4] data2 = data[4:] - profiler = dp.StructuredProfiler(data1) + profiler = dp.StructuredProfiler(data1, options=profile_options) profiler.update_profile(data2) expected_mat = np.array( [[1, 0.309924, 0.404638], [0.309924, 1, 0.548812], [0.404638, 0.548812, 1]] @@ -952,7 +968,7 @@ def test_update_chi2(self, *mocks): data1 = data[:4] data2 = data[4:] - profiler = dp.StructuredProfiler(data1) + profiler = dp.StructuredProfiler(data1, options=profile_options) profiler.update_profile(data2) expected_mat = np.array( [[1, 0.007295, 0.007295], [0.007295, 1, 0.015609], [0.007295, 0.015609, 1]] @@ -969,7 +985,7 @@ def test_update_chi2(self, *mocks): ) data1 = data[:4] data2 = data[4:] - profiler = dp.StructuredProfiler(data1) + profiler = dp.StructuredProfiler(data1, options=profile_options) profiler.update_profile(data2) expected_mat = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]) np.testing.assert_array_almost_equal(expected_mat, profiler.chi2_matrix) @@ -1203,7 +1219,12 @@ def test_report_remove_disabled_flag(self): # with options to disable FloatColumn `precision` # and with remove_disabled_flag == True profiler_options = ProfilerOptions() - profiler_options.set({"precision.is_enabled": False}) + profiler_options.set( + { + "precision.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report(report_options={"remove_disabled_flag": True}) @@ -1215,7 +1236,12 @@ def test_report_remove_disabled_flag(self): # with options to disable NumericalMixIn cal `min` # and with remove_disabled_flag == True profiler_options = ProfilerOptions() - profiler_options.set({"min.is_enabled": False}) + profiler_options.set( + { + "min.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report(report_options={"remove_disabled_flag": True}) @@ -1225,7 +1251,12 @@ def test_report_remove_disabled_flag(self): # with options to disable TextColumn 
cal `vocab` # and with remove_disabled_flag == True profiler_options = ProfilerOptions() - profiler_options.set({"vocab.is_enabled": False}) + profiler_options.set( + { + "vocab.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report(report_options={"remove_disabled_flag": True}) @@ -1234,7 +1265,12 @@ def test_report_remove_disabled_flag(self): # with profiler options and default remove_disabled_flag profiler_options = ProfilerOptions() - profiler_options.set({"min.is_enabled": False}) + profiler_options.set( + { + "min.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report() @@ -1242,7 +1278,9 @@ def test_report_remove_disabled_flag(self): self.assertIn("min", report["data_stats"][iter_value]["statistics"]) # w/o profiler options and default remove_disabled_flag - profiler = dp.StructuredProfiler(data=data) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report() for iter_value in range(0, len(data.columns) - 1): @@ -1370,7 +1408,11 @@ def recursive_test_helper(report, prev_key=None): def test_data_label_assigned(self): # only use 5 samples - trained_schema = dp.StructuredProfiler(self.aws_dataset, samples_per_update=5) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + trained_schema = dp.StructuredProfiler( + self.aws_dataset, samples_per_update=5, options=profile_options + ) report = trained_schema.report() has_non_null_column = False for i in range(len(report["data_stats"])): @@ -1754,7 +1796,10 @@ def test_duplicate_columns(self): [[1, 2, 3, 4, 5, 6], [10, 20, 30, 40, 50, 60]], columns=["a", "b", "a", "b", "c", "d"], ) - profiler = dp.StructuredProfiler(data) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + + profiler = dp.StructuredProfiler(data, options=profile_options) # Ensure columns are correctly allocated to profiles in list expected_mapping = {"a": [0, 2], "b": [1, 3], "c": [4], "d": [5]} @@ -1812,9 +1857,11 @@ def test_unique_col_permutation(self, *mocks): perm_data = pd.DataFrame( [[4, 3, 2, 1], [8, 7, 6, 5]], columns=["d", "c", "b", "a"] ) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) # Test via add - first_profiler = dp.StructuredProfiler(data) + first_profiler = dp.StructuredProfiler(data, options=profile_options) perm_profiler = dp.StructuredProfiler(perm_data) profiler = first_profiler + perm_profiler @@ -1834,7 +1881,7 @@ def test_unique_col_permutation(self, *mocks): ) # Test via update - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) profiler.update_profile(perm_data) for col_idx in range(len(profiler._profile)): @@ -4047,11 +4094,13 @@ def test_report_remove_disabled_flag(self): def test_save_and_load_pkl_file(self): data_folder = "dataprofiler/tests/data/" test_files = ["txt/code.txt", "txt/sentence-10x.txt"] + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) for test_file in test_files: # Create Data and StructuredProfiler objects 
data = dp.Data(os.path.join(data_folder, test_file)) - save_profile = UnstructuredProfiler(data) + save_profile = UnstructuredProfiler(data, options=profile_options) # If profile _empty_line_count = 0, it won't test if the variable is # saved correctly since that is also the default value. Ensure @@ -4112,7 +4161,12 @@ def test_save_and_load_no_labeler(self): data = "this is my test data: 123-456-7890" profile_options = dp.ProfilerOptions() - profile_options.set({"data_labeler.is_enabled": False}) + profile_options.set( + { + "data_labeler.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) save_profile = dp.UnstructuredProfiler(data, options=profile_options) From ad6ab5849d6023b16147dc5246d9bc5468764eed Mon Sep 17 00:00:00 2001 From: Navid Nafiuzzaman Date: Thu, 6 Jul 2023 15:27:08 -0400 Subject: [PATCH 2/8] Fix/f1 score path fix import (#952) * Fixed F1Score Import * Linted example file with Black Linter --- examples/add_new_model_to_data_labeler.ipynb | 220 ++++++++++++------- 1 file changed, 135 insertions(+), 85 deletions(-) diff --git a/examples/add_new_model_to_data_labeler.ipynb b/examples/add_new_model_to_data_labeler.ipynb index 3f59297bc..1495e6a85 100644 --- a/examples/add_new_model_to_data_labeler.ipynb +++ b/examples/add_new_model_to_data_labeler.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "228bb2a6", "metadata": {}, @@ -9,6 +10,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cab7a569", "metadata": {}, @@ -39,6 +41,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "e90728ab", "metadata": {}, @@ -47,6 +50,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3d61981c", "metadata": {}, @@ -75,6 +79,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "745ed0d4", "metadata": {}, @@ -83,6 +88,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7375b0c0", "metadata": {}, @@ -105,15 +111,19 @@ "source": [ "import tensorflow as tf\n", "import numpy as np\n", - "from dataprofiler.labelers.character_level_cnn_model import CharacterLevelCnnModel, F1Score, \\\n", - " create_glove_char, build_embd_dictionary\n", + "from dataprofiler.labelers.character_level_cnn_model import (\n", + " CharacterLevelCnnModel,\n", + " create_glove_char,\n", + " build_embd_dictionary,\n", + ")\n", "from dataprofiler.labelers.base_model import BaseModel\n", + "from dataprofiler.labelers.labeler_utils import F1Score\n", + "\n", "\n", "# CharacterLevelLstmModel derives from CharacterLevelCnnModel\n", "#########################################################\n", "#########################################################\n", "class CharacterLevelLstmModel(CharacterLevelCnnModel):\n", - "\n", " # boolean if the label mapping requires the mapping for index 0 reserved\n", " requires_zero_mapping = True\n", "\n", @@ -121,26 +131,26 @@ " \"\"\"\n", " LSTM Model Initializer\n", " \"\"\"\n", - " \n", + "\n", " # parameter initialization\n", " if not parameters:\n", " parameters = {}\n", - " parameters.setdefault('max_length', 3400)\n", - " parameters.setdefault('max_char_encoding_id', 127)\n", - " parameters.setdefault('dim_embed', 64)\n", - " parameters.setdefault('size_fc', [32, 32])\n", - " parameters.setdefault('dropout', 0.1)\n", + " parameters.setdefault(\"max_length\", 3400)\n", + " parameters.setdefault(\"max_char_encoding_id\", 127)\n", + " parameters.setdefault(\"dim_embed\", 64)\n", + " parameters.setdefault(\"size_fc\", [32, 32])\n", + " 
parameters.setdefault(\"dropout\", 0.1)\n", " # new parameters for LSTM model\n", " #########################################################\n", " #########################################################\n", - " parameters.setdefault('size_lstm', [64])\n", - " parameters.setdefault('rec_dropout', 0.1)\n", - " parameters.setdefault('activation', \"tanh\")\n", - " parameters.setdefault('recurrent_activation', \"sigmoid\")\n", + " parameters.setdefault(\"size_lstm\", [64])\n", + " parameters.setdefault(\"rec_dropout\", 0.1)\n", + " parameters.setdefault(\"activation\", \"tanh\")\n", + " parameters.setdefault(\"recurrent_activation\", \"sigmoid\")\n", " #########################################################\n", " #########################################################\n", - " parameters.setdefault('default_label', \"UNKNOWN\")\n", - " parameters['pad_label'] = 'PAD'\n", + " parameters.setdefault(\"default_label\", \"UNKNOWN\")\n", + " parameters[\"pad_label\"] = \"PAD\"\n", " self._epoch_id = 0\n", "\n", " # reconstruct flags for model\n", @@ -155,36 +165,66 @@ " present.\n", " \"\"\"\n", " errors = []\n", - " list_of_necessary_params = ['max_length', 'max_char_encoding_id',\n", - " 'dim_embed', 'size_fc', 'dropout',\n", - " 'size_lstm', 'rec_dropout', 'activation', \n", - " 'recurrent_activation', 'default_label', \n", - " 'pad_label']\n", + " list_of_necessary_params = [\n", + " \"max_length\",\n", + " \"max_char_encoding_id\",\n", + " \"dim_embed\",\n", + " \"size_fc\",\n", + " \"dropout\",\n", + " \"size_lstm\",\n", + " \"rec_dropout\",\n", + " \"activation\",\n", + " \"recurrent_activation\",\n", + " \"default_label\",\n", + " \"pad_label\",\n", + " ]\n", " # Make sure the necessary parameters are present and valid.\n", " for param in parameters:\n", - " if param in ['max_length', 'max_char_encoding_id', 'dim_embed',\n", - " 'size_conv']:\n", - " if not isinstance(parameters[param], (int, float)) \\\n", - " or parameters[param] < 0:\n", - " errors.append(param + \" must be a valid integer or float \"\n", - " \"greater than 0.\")\n", - " elif param in ['dropout', 'rec_dropout']: # additional check for rec_dropout\n", - " if not isinstance(parameters[param], (int, float)) \\\n", - " or parameters[param] < 0 or parameters[param] > 1:\n", - " errors.append(param + \" must be a valid integer or float \"\n", - " \"from 0 to 1.\")\n", - " elif param == 'size_fc' or param == 'size_lstm': # additional check for size_lstm\n", - " if not isinstance(parameters[param], list) \\\n", - " or len(parameters[param]) == 0:\n", - " errors.append(param + \" must be a non-empty list of \"\n", - " \"integers.\")\n", + " if param in [\n", + " \"max_length\",\n", + " \"max_char_encoding_id\",\n", + " \"dim_embed\",\n", + " \"size_conv\",\n", + " ]:\n", + " if (\n", + " not isinstance(parameters[param], (int, float))\n", + " or parameters[param] < 0\n", + " ):\n", + " errors.append(\n", + " param + \" must be a valid integer or float \" \"greater than 0.\"\n", + " )\n", + " elif param in [\n", + " \"dropout\",\n", + " \"rec_dropout\",\n", + " ]: # additional check for rec_dropout\n", + " if (\n", + " not isinstance(parameters[param], (int, float))\n", + " or parameters[param] < 0\n", + " or parameters[param] > 1\n", + " ):\n", + " errors.append(\n", + " param + \" must be a valid integer or float \" \"from 0 to 1.\"\n", + " )\n", + " elif (\n", + " param == \"size_fc\" or param == \"size_lstm\"\n", + " ): # additional check for size_lstm\n", + " if (\n", + " not isinstance(parameters[param], list)\n", + " 
or len(parameters[param]) == 0\n", + " ):\n", + " errors.append(param + \" must be a non-empty list of \" \"integers.\")\n", " else:\n", " for item in parameters[param]:\n", " if not isinstance(item, int):\n", - " errors.append(param + \" must be a non-empty \"\n", - " \"list of integers.\")\n", + " errors.append(\n", + " param + \" must be a non-empty \" \"list of integers.\"\n", + " )\n", " break\n", - " elif param in ['default_label', 'activation', 'recurrent_activation']: # additional check for activation and recurrent_activation\n", + " elif param in [\n", + " \"default_label\",\n", + " \"activation\",\n", + " \"recurrent_activation\",\n", + " ]: # additional check for activation and recurrent_activation\n", " if not isinstance(parameters[param], str):\n", " error = str(param) + \" must be a string.\"\n", " errors.append(error)\n", @@ -194,7 +234,7 @@ " if param not in list_of_necessary_params:\n", " errors.append(param + \" is not an accepted parameter.\")\n", " if errors:\n", - " raise ValueError('\\n'.join(errors))\n", + " raise ValueError(\"\\n\".join(errors))\n", "\n", " def _construct_model(self):\n", " \"\"\"\n", @@ -204,41 +244,44 @@ " :return: None\n", " \"\"\"\n", " num_labels = self.num_labels\n", - " default_ind = self.label_mapping[self._parameters['default_label']]\n", + " default_ind = self.label_mapping[self._parameters[\"default_label\"]]\n", "\n", " # Reset model\n", " tf.keras.backend.clear_session()\n", "\n", " # generate glove embedding\n", - " create_glove_char(self._parameters['dim_embed'])\n", + " create_glove_char(self._parameters[\"dim_embed\"])\n", "\n", " # generate model\n", " self._model = tf.keras.models.Sequential()\n", "\n", " # default parameters\n", - " max_length = self._parameters['max_length']\n", - " max_char_encoding_id = self._parameters['max_char_encoding_id']\n", + " max_length = self._parameters[\"max_length\"]\n", + " max_char_encoding_id = self._parameters[\"max_char_encoding_id\"]\n", "\n", " # Encoding layer\n", " def encoding_function(input_str):\n", " char_in_vector = CharacterLevelLstmModel._char_encoding_layer(\n", - " input_str, max_char_encoding_id, max_length)\n", + " input_str, max_char_encoding_id, max_length\n", + " )\n", " return char_in_vector\n", "\n", " self._model.add(tf.keras.layers.Input(shape=(None,), dtype=tf.string))\n", "\n", " self._model.add(\n", - " tf.keras.layers.Lambda(encoding_function,\n", - " output_shape=tuple([max_length])))\n", + " tf.keras.layers.Lambda(encoding_function, output_shape=tuple([max_length]))\n", + " )\n", "\n", " # Create a pre-trained weight matrix\n", " # character encoding indices range from 0 to max_char_encoding_id,\n", " # we add one extra index for out-of-vocabulary character\n", " embed_file = os.path.join(\n", - " \"../dataprofiler/labelers\", \"embeddings/glove-reduced-{}D.txt\".format(\n", - " self._parameters['dim_embed']))\n", - " embedding_matrix = np.zeros((max_char_encoding_id + 2,\n", - " self._parameters['dim_embed']))\n", + " \"../dataprofiler/labelers\",\n", + " \"embeddings/glove-reduced-{}D.txt\".format(self._parameters[\"dim_embed\"]),\n", + " )\n", + " embedding_matrix = np.zeros(\n", + " (max_char_encoding_id + 2, self._parameters[\"dim_embed\"])\n", + " )\n", " embedding_dict = build_embd_dictionary(embed_file)\n", "\n", " input_shape = tuple([max_length])\n", @@ -247,70 +290,74 @@ " if chr(ascii_num) in embedding_dict:\n", " embedding_matrix[ascii_num + 1] = embedding_dict[chr(ascii_num)]\n", "\n", - " self._model.add(tf.keras.layers.Embedding(\n", - " 
max_char_encoding_id + 2,\n", - " self._parameters['dim_embed'],\n", - " weights=[embedding_matrix],\n", - " input_length=input_shape[0],\n", - " trainable=True))\n", - " \n", + " self._model.add(\n", + " tf.keras.layers.Embedding(\n", + " max_char_encoding_id + 2,\n", + " self._parameters[\"dim_embed\"],\n", + " weights=[embedding_matrix],\n", + " input_length=input_shape[0],\n", + " trainable=True,\n", + " )\n", + " )\n", + "\n", " # Add the lstm layers\n", " #########################################################\n", " #########################################################\n", - " for size in self._parameters['size_lstm']:\n", + " for size in self._parameters[\"size_lstm\"]:\n", " self._model.add(\n", - " tf.keras.layers.LSTM(units=size, \n", - " recurrent_dropout=self._parameters['rec_dropout'], \n", - " activation=self._parameters['activation'],\n", - " recurrent_activation=self._parameters['recurrent_activation'],\n", - " return_sequences=True))\n", - " if self._parameters['dropout']:\n", - " self._model.add(tf.keras.layers.Dropout(self._parameters['dropout']))\n", + " tf.keras.layers.LSTM(\n", + " units=size,\n", + " recurrent_dropout=self._parameters[\"rec_dropout\"],\n", + " activation=self._parameters[\"activation\"],\n", + " recurrent_activation=self._parameters[\"recurrent_activation\"],\n", + " return_sequences=True,\n", + " )\n", + " )\n", + " if self._parameters[\"dropout\"]:\n", + " self._model.add(tf.keras.layers.Dropout(self._parameters[\"dropout\"]))\n", " #########################################################\n", " #########################################################\n", "\n", " # Add the fully connected layers\n", - " for size in self._parameters['size_fc']:\n", - " self._model.add(\n", - " tf.keras.layers.Dense(units=size, activation='relu'))\n", - " if self._parameters['dropout']:\n", - " self._model.add(\n", - " tf.keras.layers.Dropout(self._parameters['dropout']))\n", + " for size in self._parameters[\"size_fc\"]:\n", + " self._model.add(tf.keras.layers.Dense(units=size, activation=\"relu\"))\n", + " if self._parameters[\"dropout\"]:\n", + " self._model.add(tf.keras.layers.Dropout(self._parameters[\"dropout\"]))\n", "\n", " # Add the final Softmax layer\n", - " self._model.add(\n", - " tf.keras.layers.Dense(num_labels, activation='softmax'))\n", + " self._model.add(tf.keras.layers.Dense(num_labels, activation=\"softmax\"))\n", "\n", " # Output the model into a .pb file for TensorFlow\n", " argmax_layer = tf.keras.backend.argmax(self._model.output)\n", "\n", " # Create confidence layers\n", " final_predicted_layer = CharacterLevelLstmModel._argmax_threshold_layer(\n", - " num_labels, threshold=0.0, default_ind=default_ind)\n", + " num_labels, threshold=0.0, default_ind=default_ind\n", + " )\n", "\n", - " argmax_outputs = self._model.outputs + \\\n", - " [argmax_layer,\n", - " final_predicted_layer(argmax_layer, self._model.output)]\n", + " argmax_outputs = self._model.outputs + [\n", + " argmax_layer,\n", + " final_predicted_layer(argmax_layer, self._model.output),\n", + " ]\n", " self._model = tf.keras.Model(self._model.inputs, argmax_outputs)\n", "\n", " # Compile the model\n", - " softmax_output_layer_name = self._model.outputs[0].name.split('/')[0]\n", + " softmax_output_layer_name = self._model.outputs[0].name.split(\"/\")[0]\n", " losses = {softmax_output_layer_name: \"categorical_crossentropy\"}\n", "\n", " # use f1 score metric\n", - " f1_score_training = F1Score(num_classes=num_labels, average='micro')\n", - " metrics = 
{softmax_output_layer_name: ['acc', f1_score_training]}\n", + " f1_score_training = F1Score(num_classes=num_labels, average=\"micro\")\n", + " metrics = {softmax_output_layer_name: [\"acc\", f1_score_training]}\n", "\n", - " self._model.compile(loss=losses,\n", - " optimizer=\"adam\",\n", - " metrics=metrics)\n", + " self._model.compile(loss=losses, optimizer=\"adam\", metrics=metrics)\n", "\n", " self._epoch_id = 0\n", " self._model_num_labels = num_labels\n", - " self._model_default_ind = default_ind\n" + " self._model_default_ind = default_ind" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "d66bd25c", "metadata": {}, @@ -319,6 +366,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "479f407a", "metadata": {}, @@ -365,6 +413,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "14b78c69", "metadata": {}, @@ -406,6 +455,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cc60ff8a", "metadata": {}, From ed95959e5cf39146359056370af9401f7598d9a1 Mon Sep 17 00:00:00 2001 From: Richard Bann <87214439+drahc1R@users.noreply.github.com> Date: Thu, 6 Jul 2023 16:36:48 -0400 Subject: [PATCH 3/8] Scipy bug fix (#951) * update * renamed var and removed from for loops * refactored var --- dataprofiler/profilers/graph_profiler.py | 16 +++++++++++++++- requirements.txt | 3 ++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/dataprofiler/profilers/graph_profiler.py b/dataprofiler/profilers/graph_profiler.py index 123961d88..ecb5d63f6 100644 --- a/dataprofiler/profilers/graph_profiler.py +++ b/dataprofiler/profilers/graph_profiler.py @@ -1,6 +1,7 @@ """Class and functions to calculate and profile properties of graph data.""" from __future__ import annotations +import importlib import pickle from collections import defaultdict from datetime import datetime @@ -10,6 +11,7 @@ import numpy as np import pandas as pd import scipy.stats as st +from packaging import version from ..data_readers.graph_data import GraphData from . 
import utils @@ -391,6 +393,11 @@ def _get_continuous_distribution( st.lognorm, st.gamma, ] + + scipy_gte_1_11_0 = version.parse( + importlib.metadata.version("scipy") + ) >= version.parse("1.11.0") + for attribute in attributes: if attribute in continuous_attributes: data_as_list = self._attribute_data_as_list(graph, attribute) @@ -401,7 +408,14 @@ def _get_continuous_distribution( for distribution in distribution_candidates: # compute fit, mle, kolmogorov-smirnov test to test fit, and pdf - fit = distribution.fit(df) + + # scipy 1.11.0 updated the way they handle + # the loc parameter in fit() for lognorm + if distribution == st.lognorm and scipy_gte_1_11_0: + fit = distribution.fit(df, superfit=True) + + else: + fit = distribution.fit(df) mle = distribution.nnlf(fit, df) if mle <= best_mle: diff --git a/requirements.txt b/requirements.txt index 994ec78de..8532aaabf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,9 +10,10 @@ fastavro>=1.0.0.post1 python-snappy>=0.5.4 charset-normalizer>=1.3.6 psutil>=4.0.0 -scipy>=1.4.1,<1.11.0 +scipy>=1.4.1 requests>=2.28.1 networkx>=2.5.1 typing-extensions>=3.10.0.2 HLL>=2.0.3 datasketches>=4.1.0 +packaging>=23.0 From 34dad6cacfe4a1ac16fdfe3e1e5f41f4dd531f56 Mon Sep 17 00:00:00 2001 From: Junho Lee <53921230+junholee6a@users.noreply.github.com> Date: Wed, 12 Jul 2023 12:09:18 -0400 Subject: [PATCH 4/8] Make BaseDataProcessor.process() compatible with all argument sets (#954) A method signature that uses *args: Any, **kwargs: Any is compatible with any set of arguments in mypy, despite being an LSP violation. This lets us assert that subclasses of BaseDataProcessor should have some process() method with an arbitrary signature. We also add to the return type of BaseDataPreprocessor so that it is inclusive of all of its subclasses. 
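
For illustration only (editor's sketch, not part of the patch; class and method names are hypothetical): mypy treats a `*args: Any, **kwargs: Any` signature on an abstract method as compatible with any concrete override, so a subclass can declare a narrower, specific signature without a `# type: ignore`, which is exactly the property the message above relies on.

    from __future__ import annotations

    import abc
    from typing import Any


    class BaseProcessor(abc.ABC):
        @abc.abstractmethod
        def process(self, *args: Any, **kwargs: Any) -> Any:
            """Accept any argument set; concrete subclasses narrow this."""
            raise NotImplementedError()


    class BatchProcessor(BaseProcessor):
        # mypy accepts this narrower override of the *args/**kwargs base signature.
        def process(self, data: list[int], batch_size: int = 32) -> list[int]:
            return data[:batch_size]
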
Co-authored-by: JGSweets --- dataprofiler/labelers/data_processing.py | 28 +++++++++++++----------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index 4613d05de..53588d949 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -129,7 +129,7 @@ def set_params(self, **kwargs: Any) -> None: self._parameters[param] = kwargs[param] @abc.abstractmethod - def process(self, *args: Any) -> Any: + def process(self, *args: Any, **kwargs: Any) -> Any: """Process data.""" raise NotImplementedError() @@ -169,13 +169,15 @@ def __init__(self, **parameters: Any) -> None: super().__init__(**parameters) @abc.abstractmethod - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, label_mapping: dict[str, int] | None = None, batch_size: int = 32, - ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None]: + ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] | tuple[ + np.ndarray, np.ndarray + ] | np.ndarray: """Preprocess data.""" raise NotImplementedError() @@ -191,7 +193,7 @@ def __init__(self, **parameters: Any) -> None: super().__init__(**parameters) @abc.abstractmethod - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -240,7 +242,7 @@ def help(cls) -> None: ) print(help_str) - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, @@ -668,7 +670,7 @@ def gen_none() -> Generator[None, None, None]: if batch_data["samples"]: yield batch_data - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, @@ -836,7 +838,7 @@ def _validate_parameters(self, parameters: dict) -> None: if errors: raise ValueError("\n".join(errors)) - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, @@ -1269,7 +1271,7 @@ def match_sentence_lengths( return results - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -1439,7 +1441,7 @@ def convert_to_unstructured_format( return text, entities - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, @@ -1800,7 +1802,7 @@ def convert_to_structured_analysis( return results - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -2022,7 +2024,7 @@ def split_prediction(results: dict) -> None: pred, axis=1, ord=1, keepdims=True ) - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -2160,7 +2162,7 @@ def _save_processor(self, dirpath: str) -> None: ) as fp: json.dump(params, fp) - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -2253,7 +2255,7 @@ def help(cls) -> None: ) print(help_str) - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, From 5125af4ea44e7a5a4cff2218567e4b47950c9b9f Mon Sep 17 00:00:00 2001 From: Junho Lee <53921230+junholee6a@users.noreply.github.com> Date: Wed, 12 Jul 2023 13:51:19 -0400 Subject: [PATCH 5/8] Fix name mangling and typevar errors (#955) Inside the BaseDataProcessor class definition, references to __subclasses are automatically replaced with _BaseDataProcessor__subclasses. This remains the case even in static methods _register_subclass() and get_class(). Same with BaseModel and its __subclasses field. 
So we do not have to write out the full name mangled identifiers inside the class definitions. Also, mypy doesn't seem to be able to handle the return type of BaseDataProcessor.get_class() being a typevar, so that was changed to type[BaseDataProcessor]. This does not affect the functionality of get_class() since it always returns a subclass of BaseDataProcessor. --- dataprofiler/labelers/base_model.py | 6 +++--- dataprofiler/labelers/data_processing.py | 12 +++++------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/dataprofiler/labelers/base_model.py b/dataprofiler/labelers/base_model.py index a4eb0b1d2..032c2ea38 100644 --- a/dataprofiler/labelers/base_model.py +++ b/dataprofiler/labelers/base_model.py @@ -32,7 +32,7 @@ def __new__( class BaseModel(metaclass=abc.ABCMeta): """For labeling data.""" - _BaseModel__subclasses: dict[str, type[BaseModel]] = {} + __subclasses: dict[str, type[BaseModel]] = {} __metaclass__ = abc.ABCMeta # boolean if the label mapping requires the mapping for index 0 reserved @@ -90,7 +90,7 @@ def __eq__(self, other: object) -> bool: def _register_subclass(cls) -> None: """Register a subclass for the class factory.""" if not inspect.isabstract(cls): - cls._BaseModel__subclasses[cls.__name__.lower()] = cls + cls.__subclasses[cls.__name__.lower()] = cls @property def label_mapping(self) -> dict[str, int]: @@ -156,7 +156,7 @@ def get_class(cls, class_name: str) -> type[BaseModel] | None: from .column_name_model import ColumnNameModel # NOQA from .regex_model import RegexModel # NOQA - return cls._BaseModel__subclasses.get(class_name.lower(), None) + return cls.__subclasses.get(class_name.lower(), None) def get_parameters(self, param_list: list[str] | None = None) -> dict: """ diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index 53588d949..bd06a59a4 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -49,16 +49,14 @@ def __init__(self, **parameters: Any) -> None: def _register_subclass(cls) -> None: """Register a subclass for the class factory.""" if not inspect.isabstract(cls): - cls._BaseDataProcessor__subclasses[ # type: ignore - cls.__name__.lower() - ] = cls + cls.__subclasses[cls.__name__.lower()] = cls @classmethod - def get_class(cls: type[Processor], class_name: str) -> type[Processor] | None: + def get_class( + cls: type[BaseDataProcessor], class_name: str + ) -> type[BaseDataProcessor] | None: """Get class of BaseDataProcessor object.""" - return cls._BaseDataProcessor__subclasses.get( # type: ignore - class_name.lower(), None - ) + return cls.__subclasses.get(class_name.lower(), None) def __eq__(self, other: object) -> bool: """ From 92346a308a54f9b509a837c086bd6a9feefc3c56 Mon Sep 17 00:00:00 2001 From: Junho Lee <53921230+junholee6a@users.noreply.github.com> Date: Thu, 20 Jul 2023 12:29:38 -0400 Subject: [PATCH 6/8] None-check labels dependants (#964) The mypy errors addressed here occur because variables label_mapping (in CharPreprocessor), unstructured_labels, and unstructured_label_set (in StructCharPreprocessor.process()) have optional types when they're used. This is fixed by checking that they are not None prior to the operation, which mypy recognizes as removing the None type from them. This should have no effect on functionality because we are already checking that labels is not None, and the variables above all depend on labels such that they are None only if labels is None. 
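
A minimal editor's sketch of the narrowing pattern described above (hypothetical names, not code from this patch): guarding a value with `is not None` before use lets mypy drop `None` from its inferred type inside that branch, which is why the added checks silence the errors without changing behavior.

    def num_classes(label_mapping: dict[str, int] | None) -> int:
        # Without the guard, mypy flags `label_mapping.values()` because the type
        # is Optional; inside the branch it is narrowed to dict[str, int].
        if label_mapping is not None:
            return max(label_mapping.values()) + 1
        return 0
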
--- dataprofiler/labelers/data_processing.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index bd06a59a4..be1a3fee4 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -735,8 +735,8 @@ def process( X_train = np.array( [[sentence] for sentence in batch_data["samples"]], dtype=object ) - if labels is not None: - num_classes = max(label_mapping.values()) + 1 # type: ignore + if labels is not None and label_mapping is not None: + num_classes = max(label_mapping.values()) + 1 Y_train = tf.keras.utils.to_categorical( batch_data["labels"], num_classes @@ -1503,8 +1503,12 @@ def process( unstructured_label_set, ) = self.convert_to_unstructured_format(batch_data, batch_labels) unstructured_data[ind] = unstructured_text - if labels is not None: - unstructured_labels[ind] = unstructured_label_set # type: ignore + if ( + labels is not None + and unstructured_labels is not None + and unstructured_label_set is not None + ): + unstructured_labels[ind] = unstructured_label_set if labels is not None: np_unstruct_labels = np.array(unstructured_labels, dtype="object") From acb9c5efe49801034db05d527d46c03671da7723 Mon Sep 17 00:00:00 2001 From: clee1152 Date: Mon, 24 Jul 2023 11:22:03 -0400 Subject: [PATCH 7/8] Changed `publish-python-package.yml` to include only release branches. (#965) * Changed release option to only release branches named \'release/\'. * Reverted types --- .github/workflows/publish-python-package.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/publish-python-package.yml b/.github/workflows/publish-python-package.yml index d2b43419f..3c593d4ed 100644 --- a/.github/workflows/publish-python-package.yml +++ b/.github/workflows/publish-python-package.yml @@ -7,6 +7,8 @@ name: Publish Python Package on: release: types: [created] + branches: + - 'release/*' jobs: deploy: From 36fc74a1b89c19ac61afb8bd8eb0da4bb7536b56 Mon Sep 17 00:00:00 2001 From: Junho Lee Date: Mon, 24 Jul 2023 21:28:17 -0400 Subject: [PATCH 8/8] Delay transforming priority_order into ndarray In the changed code, we had a mypy error because numpy ndarrays are not compatible with random.Random.shuffle() (expected argument type is MutableSequence[Any]) We fix this by first instantiating priority_order as a list, then shuffling it, then creating an ndarray from it afterwards. --- dataprofiler/labelers/data_processing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index be1a3fee4..fabb7a08b 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -2047,9 +2047,9 @@ def process( elif aggregation_func == "random": num_labels = max(label_mapping.values()) + 1 random_state: random.Random = self._parameters["random_state"] - priority_order = np.array(list(range(num_labels))) - random_state.shuffle(priority_order) # type: ignore - self.priority_prediction(results, priority_order) + priority_order = list(range(num_labels)) + random_state.shuffle(priority_order) + self.priority_prediction(results, np.array(priority_order)) else: raise ValueError( f"`{aggregation_func}` is not a valid aggregation function"
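
Editor's note on the pattern described in the commit message above (a minimal sketch with illustrative values, not code from the patch): `random.Random.shuffle()` is typed against `MutableSequence[Any]`, so shuffling a plain list and building the ndarray only afterwards satisfies mypy while producing the same result.

    import random

    import numpy as np

    rng = random.Random(0)
    priority_order = list(range(5))             # plain list: accepted by Random.shuffle()
    rng.shuffle(priority_order)                 # in-place shuffle, no type error
    priority_array = np.array(priority_order)   # convert to ndarray after shuffling
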