From b63b25d1a3cd37c308bd1cbd93048294fcb2fe62 Mon Sep 17 00:00:00 2001 From: Liz Smith Date: Thu, 6 Jul 2023 13:54:59 -0500 Subject: [PATCH 1/8] let's try this again (#953) --- .../tests/profilers/test_profile_builder.py | 110 +++++++++++++----- 1 file changed, 82 insertions(+), 28 deletions(-) diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index 7a309e946..a4326492f 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -327,7 +327,12 @@ def test_correlation(self, *mock): # sum((x - np.mean(x))*(y-np.mean(y))) / # np.sqrt(sum((x - np.mean(x)**2)))/np.sqrt(sum((y - np.mean(y)**2))) profile_options = dp.ProfilerOptions() - profile_options.set({"correlation.is_enabled": True}) + profile_options.set( + { + "correlation.is_enabled": True, + "structured_options.multiprocess.is_enabled": False, + } + ) # data with a sole numeric column data = pd.DataFrame([1.0, 8.0, 1.0, -2.0, 5.0]) @@ -580,7 +585,12 @@ def test_merge_correlation(self, *mocks): def test_correlation_update(self): profile_options = dp.ProfilerOptions() - profile_options.set({"correlation.is_enabled": True}) + profile_options.set( + { + "correlation.is_enabled": True, + "structured_options.multiprocess.is_enabled": False, + } + ) # Test with all numeric columns data = pd.DataFrame( @@ -776,12 +786,14 @@ def test_correlation_selected_columns(self, *mocks): def test_chi2(self, *mocks): # Empty data = pd.DataFrame([]) - profiler = dp.StructuredProfiler(data) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + profiler = dp.StructuredProfiler(data, options=profile_options) self.assertIsNone(profiler.chi2_matrix) # Single column data = pd.DataFrame({"a": ["y", "y", "n", "n", "y"]}) - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) expected_mat = np.array([1]) self.assertEqual(expected_mat, profiler.chi2_matrix) @@ -793,7 +805,7 @@ def test_chi2(self, *mocks): } ) - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) expected_mat = np.array( [[1, 0.309924, 0.404638], [0.309924, 1, 0.548812], [0.404638, 0.548812, 1]] ) @@ -808,7 +820,7 @@ def test_chi2(self, *mocks): } ) - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) expected_mat = np.array( [[1, 0.007295, 0.007295], [0.007295, 1, 0.015609], [0.007295, 0.015609, 1]] ) @@ -823,7 +835,7 @@ def test_chi2(self, *mocks): } ) - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) expected_mat = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]) np.testing.assert_array_almost_equal(expected_mat, profiler.chi2_matrix) @@ -840,8 +852,10 @@ def test_merge_chi2(self, *mocks): "c": ["n", "maybe", "n", "n", "n", "y", "y"], } ) - profiler1 = dp.StructuredProfiler(None) - profiler2 = dp.StructuredProfiler(data) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + profiler1 = dp.StructuredProfiler(None, options=profile_options) + profiler2 = dp.StructuredProfiler(data, options=profile_options) with mock.patch( "dataprofiler.profilers.profile_builder." 
"StructuredProfiler._add_error_checks" @@ -862,8 +876,8 @@ def test_merge_chi2(self, *mocks): data1 = data[:4] data2 = data[4:] - profiler1 = dp.StructuredProfiler(data1) - profiler2 = dp.StructuredProfiler(data2) + profiler1 = dp.StructuredProfiler(data1, options=profile_options) + profiler2 = dp.StructuredProfiler(data2, options=profile_options) profiler3 = profiler1 + profiler2 expected_mat = np.array( [[1, 0.309924, 0.404638], [0.309924, 1, 0.548812], [0.404638, 0.548812, 1]] @@ -880,8 +894,8 @@ def test_merge_chi2(self, *mocks): ) data1 = data[:4] data2 = data[4:] - profiler1 = dp.StructuredProfiler(data1) - profiler2 = dp.StructuredProfiler(data2) + profiler1 = dp.StructuredProfiler(data1, options=profile_options) + profiler2 = dp.StructuredProfiler(data2, options=profile_options) profiler3 = profiler1 + profiler2 expected_mat = np.array( [[1, 0.007295, 0.007295], [0.007295, 1, 0.015609], [0.007295, 0.015609, 1]] @@ -918,7 +932,9 @@ def test_update_chi2(self, *mocks): } ) data2 = pd.DataFrame({"a": [], "b": [], "c": []}) - profiler = dp.StructuredProfiler(data1) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + profiler = dp.StructuredProfiler(data1, options=profile_options) profiler.update_profile(data2) expected_mat = np.array( [[1, 0.309924, 0.404638], [0.309924, 1, 0.548812], [0.404638, 0.548812, 1]] @@ -934,7 +950,7 @@ def test_update_chi2(self, *mocks): ) data1 = data[:4] data2 = data[4:] - profiler = dp.StructuredProfiler(data1) + profiler = dp.StructuredProfiler(data1, options=profile_options) profiler.update_profile(data2) expected_mat = np.array( [[1, 0.309924, 0.404638], [0.309924, 1, 0.548812], [0.404638, 0.548812, 1]] @@ -952,7 +968,7 @@ def test_update_chi2(self, *mocks): data1 = data[:4] data2 = data[4:] - profiler = dp.StructuredProfiler(data1) + profiler = dp.StructuredProfiler(data1, options=profile_options) profiler.update_profile(data2) expected_mat = np.array( [[1, 0.007295, 0.007295], [0.007295, 1, 0.015609], [0.007295, 0.015609, 1]] @@ -969,7 +985,7 @@ def test_update_chi2(self, *mocks): ) data1 = data[:4] data2 = data[4:] - profiler = dp.StructuredProfiler(data1) + profiler = dp.StructuredProfiler(data1, options=profile_options) profiler.update_profile(data2) expected_mat = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]) np.testing.assert_array_almost_equal(expected_mat, profiler.chi2_matrix) @@ -1203,7 +1219,12 @@ def test_report_remove_disabled_flag(self): # with options to disable FloatColumn `precision` # and with remove_disabled_flag == True profiler_options = ProfilerOptions() - profiler_options.set({"precision.is_enabled": False}) + profiler_options.set( + { + "precision.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report(report_options={"remove_disabled_flag": True}) @@ -1215,7 +1236,12 @@ def test_report_remove_disabled_flag(self): # with options to disable NumericalMixIn cal `min` # and with remove_disabled_flag == True profiler_options = ProfilerOptions() - profiler_options.set({"min.is_enabled": False}) + profiler_options.set( + { + "min.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report(report_options={"remove_disabled_flag": True}) @@ -1225,7 +1251,12 @@ def test_report_remove_disabled_flag(self): # with options to disable TextColumn 
cal `vocab` # and with remove_disabled_flag == True profiler_options = ProfilerOptions() - profiler_options.set({"vocab.is_enabled": False}) + profiler_options.set( + { + "vocab.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report(report_options={"remove_disabled_flag": True}) @@ -1234,7 +1265,12 @@ def test_report_remove_disabled_flag(self): # with profiler options and default remove_disabled_flag profiler_options = ProfilerOptions() - profiler_options.set({"min.is_enabled": False}) + profiler_options.set( + { + "min.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report() @@ -1242,7 +1278,9 @@ def test_report_remove_disabled_flag(self): self.assertIn("min", report["data_stats"][iter_value]["statistics"]) # w/o profiler options and default remove_disabled_flag - profiler = dp.StructuredProfiler(data=data) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report() for iter_value in range(0, len(data.columns) - 1): @@ -1370,7 +1408,11 @@ def recursive_test_helper(report, prev_key=None): def test_data_label_assigned(self): # only use 5 samples - trained_schema = dp.StructuredProfiler(self.aws_dataset, samples_per_update=5) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + trained_schema = dp.StructuredProfiler( + self.aws_dataset, samples_per_update=5, options=profile_options + ) report = trained_schema.report() has_non_null_column = False for i in range(len(report["data_stats"])): @@ -1754,7 +1796,10 @@ def test_duplicate_columns(self): [[1, 2, 3, 4, 5, 6], [10, 20, 30, 40, 50, 60]], columns=["a", "b", "a", "b", "c", "d"], ) - profiler = dp.StructuredProfiler(data) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + + profiler = dp.StructuredProfiler(data, options=profile_options) # Ensure columns are correctly allocated to profiles in list expected_mapping = {"a": [0, 2], "b": [1, 3], "c": [4], "d": [5]} @@ -1812,9 +1857,11 @@ def test_unique_col_permutation(self, *mocks): perm_data = pd.DataFrame( [[4, 3, 2, 1], [8, 7, 6, 5]], columns=["d", "c", "b", "a"] ) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) # Test via add - first_profiler = dp.StructuredProfiler(data) + first_profiler = dp.StructuredProfiler(data, options=profile_options) perm_profiler = dp.StructuredProfiler(perm_data) profiler = first_profiler + perm_profiler @@ -1834,7 +1881,7 @@ def test_unique_col_permutation(self, *mocks): ) # Test via update - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) profiler.update_profile(perm_data) for col_idx in range(len(profiler._profile)): @@ -4047,11 +4094,13 @@ def test_report_remove_disabled_flag(self): def test_save_and_load_pkl_file(self): data_folder = "dataprofiler/tests/data/" test_files = ["txt/code.txt", "txt/sentence-10x.txt"] + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) for test_file in test_files: # Create Data and StructuredProfiler objects 
data = dp.Data(os.path.join(data_folder, test_file)) - save_profile = UnstructuredProfiler(data) + save_profile = UnstructuredProfiler(data, options=profile_options) # If profile _empty_line_count = 0, it won't test if the variable is # saved correctly since that is also the default value. Ensure @@ -4112,7 +4161,12 @@ def test_save_and_load_no_labeler(self): data = "this is my test data: 123-456-7890" profile_options = dp.ProfilerOptions() - profile_options.set({"data_labeler.is_enabled": False}) + profile_options.set( + { + "data_labeler.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) save_profile = dp.UnstructuredProfiler(data, options=profile_options) From ad6ab5849d6023b16147dc5246d9bc5468764eed Mon Sep 17 00:00:00 2001 From: Navid Nafiuzzaman Date: Thu, 6 Jul 2023 15:27:08 -0400 Subject: [PATCH 2/8] Fix/f1 score path fix import (#952) * Fixed F1Score Import * Linted example file with Black Linter --- examples/add_new_model_to_data_labeler.ipynb | 220 ++++++++++++------- 1 file changed, 135 insertions(+), 85 deletions(-) diff --git a/examples/add_new_model_to_data_labeler.ipynb b/examples/add_new_model_to_data_labeler.ipynb index 3f59297bc..1495e6a85 100644 --- a/examples/add_new_model_to_data_labeler.ipynb +++ b/examples/add_new_model_to_data_labeler.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "228bb2a6", "metadata": {}, @@ -9,6 +10,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cab7a569", "metadata": {}, @@ -39,6 +41,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "e90728ab", "metadata": {}, @@ -47,6 +50,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3d61981c", "metadata": {}, @@ -75,6 +79,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "745ed0d4", "metadata": {}, @@ -83,6 +88,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7375b0c0", "metadata": {}, @@ -105,15 +111,19 @@ "source": [ "import tensorflow as tf\n", "import numpy as np\n", - "from dataprofiler.labelers.character_level_cnn_model import CharacterLevelCnnModel, F1Score, \\\n", - " create_glove_char, build_embd_dictionary\n", + "from dataprofiler.labelers.character_level_cnn_model import (\n", + " CharacterLevelCnnModel,\n", + " create_glove_char,\n", + " build_embd_dictionary,\n", + ")\n", "from dataprofiler.labelers.base_model import BaseModel\n", + "from dataprofiler.labelers.labeler_utils import F1Score\n", + "\n", "\n", "# CharacterLevelLstmModel derives from CharacterLevelCnnModel\n", "#########################################################\n", "#########################################################\n", "class CharacterLevelLstmModel(CharacterLevelCnnModel):\n", - "\n", " # boolean if the label mapping requires the mapping for index 0 reserved\n", " requires_zero_mapping = True\n", "\n", @@ -121,26 +131,26 @@ " \"\"\"\n", " LSTM Model Initializer\n", " \"\"\"\n", - " \n", + "\n", " # parameter initialization\n", " if not parameters:\n", " parameters = {}\n", - " parameters.setdefault('max_length', 3400)\n", - " parameters.setdefault('max_char_encoding_id', 127)\n", - " parameters.setdefault('dim_embed', 64)\n", - " parameters.setdefault('size_fc', [32, 32])\n", - " parameters.setdefault('dropout', 0.1)\n", + " parameters.setdefault(\"max_length\", 3400)\n", + " parameters.setdefault(\"max_char_encoding_id\", 127)\n", + " parameters.setdefault(\"dim_embed\", 64)\n", + " parameters.setdefault(\"size_fc\", [32, 32])\n", + " 
parameters.setdefault(\"dropout\", 0.1)\n", " # new parameters for LSTM model\n", " #########################################################\n", " #########################################################\n", - " parameters.setdefault('size_lstm', [64])\n", - " parameters.setdefault('rec_dropout', 0.1)\n", - " parameters.setdefault('activation', \"tanh\")\n", - " parameters.setdefault('recurrent_activation', \"sigmoid\")\n", + " parameters.setdefault(\"size_lstm\", [64])\n", + " parameters.setdefault(\"rec_dropout\", 0.1)\n", + " parameters.setdefault(\"activation\", \"tanh\")\n", + " parameters.setdefault(\"recurrent_activation\", \"sigmoid\")\n", " #########################################################\n", " #########################################################\n", - " parameters.setdefault('default_label', \"UNKNOWN\")\n", - " parameters['pad_label'] = 'PAD'\n", + " parameters.setdefault(\"default_label\", \"UNKNOWN\")\n", + " parameters[\"pad_label\"] = \"PAD\"\n", " self._epoch_id = 0\n", "\n", " # reconstruct flags for model\n", @@ -155,36 +165,66 @@ " present.\n", " \"\"\"\n", " errors = []\n", - " list_of_necessary_params = ['max_length', 'max_char_encoding_id',\n", - " 'dim_embed', 'size_fc', 'dropout',\n", - " 'size_lstm', 'rec_dropout', 'activation', \n", - " 'recurrent_activation', 'default_label', \n", - " 'pad_label']\n", + " list_of_necessary_params = [\n", + " \"max_length\",\n", + " \"max_char_encoding_id\",\n", + " \"dim_embed\",\n", + " \"size_fc\",\n", + " \"dropout\",\n", + " \"size_lstm\",\n", + " \"rec_dropout\",\n", + " \"activation\",\n", + " \"recurrent_activation\",\n", + " \"default_label\",\n", + " \"pad_label\",\n", + " ]\n", " # Make sure the necessary parameters are present and valid.\n", " for param in parameters:\n", - " if param in ['max_length', 'max_char_encoding_id', 'dim_embed',\n", - " 'size_conv']:\n", - " if not isinstance(parameters[param], (int, float)) \\\n", - " or parameters[param] < 0:\n", - " errors.append(param + \" must be a valid integer or float \"\n", - " \"greater than 0.\")\n", - " elif param in ['dropout', 'rec_dropout']: # additional check for rec_dropout\n", - " if not isinstance(parameters[param], (int, float)) \\\n", - " or parameters[param] < 0 or parameters[param] > 1:\n", - " errors.append(param + \" must be a valid integer or float \"\n", - " \"from 0 to 1.\")\n", - " elif param == 'size_fc' or param == 'size_lstm': # additional check for size_lstm\n", - " if not isinstance(parameters[param], list) \\\n", - " or len(parameters[param]) == 0:\n", - " errors.append(param + \" must be a non-empty list of \"\n", - " \"integers.\")\n", + " if param in [\n", + " \"max_length\",\n", + " \"max_char_encoding_id\",\n", + " \"dim_embed\",\n", + " \"size_conv\",\n", + " ]:\n", + " if (\n", + " not isinstance(parameters[param], (int, float))\n", + " or parameters[param] < 0\n", + " ):\n", + " errors.append(\n", + " param + \" must be a valid integer or float \" \"greater than 0.\"\n", + " )\n", + " elif param in [\n", + " \"dropout\",\n", + " \"rec_dropout\",\n", + " ]: # additional check for rec_dropout\n", + " if (\n", + " not isinstance(parameters[param], (int, float))\n", + " or parameters[param] < 0\n", + " or parameters[param] > 1\n", + " ):\n", + " errors.append(\n", + " param + \" must be a valid integer or float \" \"from 0 to 1.\"\n", + " )\n", + " elif (\n", + " param == \"size_fc\" or param == \"size_lstm\"\n", + " ): # additional check for size_lstm\n", + " if (\n", + " not isinstance(parameters[param], list)\n", + " 
or len(parameters[param]) == 0\n", + " ):\n", + " errors.append(param + \" must be a non-empty list of \" \"integers.\")\n", " else:\n", " for item in parameters[param]:\n", " if not isinstance(item, int):\n", - " errors.append(param + \" must be a non-empty \"\n", - " \"list of integers.\")\n", + " errors.append(\n", + " param + \" must be a non-empty \" \"list of integers.\"\n", + " )\n", " break\n", - " elif param in ['default_label', 'activation', 'recurrent_activation']: # additional check for activation and recurrent_activation\n", + " elif param in [\n", + " \"default_label\",\n", + " \"activation\",\n", + " \"recurrent_activation\",\n", + " ]: # additional check for activation and recurrent_activation\n", " if not isinstance(parameters[param], str):\n", " error = str(param) + \" must be a string.\"\n", " errors.append(error)\n", @@ -194,7 +234,7 @@ " if param not in list_of_necessary_params:\n", " errors.append(param + \" is not an accepted parameter.\")\n", " if errors:\n", - " raise ValueError('\\n'.join(errors))\n", + " raise ValueError(\"\\n\".join(errors))\n", "\n", " def _construct_model(self):\n", " \"\"\"\n", @@ -204,41 +244,44 @@ " :return: None\n", " \"\"\"\n", " num_labels = self.num_labels\n", - " default_ind = self.label_mapping[self._parameters['default_label']]\n", + " default_ind = self.label_mapping[self._parameters[\"default_label\"]]\n", "\n", " # Reset model\n", " tf.keras.backend.clear_session()\n", "\n", " # generate glove embedding\n", - " create_glove_char(self._parameters['dim_embed'])\n", + " create_glove_char(self._parameters[\"dim_embed\"])\n", "\n", " # generate model\n", " self._model = tf.keras.models.Sequential()\n", "\n", " # default parameters\n", - " max_length = self._parameters['max_length']\n", - " max_char_encoding_id = self._parameters['max_char_encoding_id']\n", + " max_length = self._parameters[\"max_length\"]\n", + " max_char_encoding_id = self._parameters[\"max_char_encoding_id\"]\n", "\n", " # Encoding layer\n", " def encoding_function(input_str):\n", " char_in_vector = CharacterLevelLstmModel._char_encoding_layer(\n", - " input_str, max_char_encoding_id, max_length)\n", + " input_str, max_char_encoding_id, max_length\n", + " )\n", " return char_in_vector\n", "\n", " self._model.add(tf.keras.layers.Input(shape=(None,), dtype=tf.string))\n", "\n", " self._model.add(\n", - " tf.keras.layers.Lambda(encoding_function,\n", - " output_shape=tuple([max_length])))\n", + " tf.keras.layers.Lambda(encoding_function, output_shape=tuple([max_length]))\n", + " )\n", "\n", " # Create a pre-trained weight matrix\n", " # character encoding indices range from 0 to max_char_encoding_id,\n", " # we add one extra index for out-of-vocabulary character\n", " embed_file = os.path.join(\n", - " \"../dataprofiler/labelers\", \"embeddings/glove-reduced-{}D.txt\".format(\n", - " self._parameters['dim_embed']))\n", - " embedding_matrix = np.zeros((max_char_encoding_id + 2,\n", - " self._parameters['dim_embed']))\n", + " \"../dataprofiler/labelers\",\n", + " \"embeddings/glove-reduced-{}D.txt\".format(self._parameters[\"dim_embed\"]),\n", + " )\n", + " embedding_matrix = np.zeros(\n", + " (max_char_encoding_id + 2, self._parameters[\"dim_embed\"])\n", + " )\n", " embedding_dict = build_embd_dictionary(embed_file)\n", "\n", " input_shape = tuple([max_length])\n", @@ -247,70 +290,74 @@ " if chr(ascii_num) in embedding_dict:\n", " embedding_matrix[ascii_num + 1] = embedding_dict[chr(ascii_num)]\n", "\n", - " self._model.add(tf.keras.layers.Embedding(\n", - " 
max_char_encoding_id + 2,\n", - " self._parameters['dim_embed'],\n", - " weights=[embedding_matrix],\n", - " input_length=input_shape[0],\n", - " trainable=True))\n", - " \n", + " self._model.add(\n", + " tf.keras.layers.Embedding(\n", + " max_char_encoding_id + 2,\n", + " self._parameters[\"dim_embed\"],\n", + " weights=[embedding_matrix],\n", + " input_length=input_shape[0],\n", + " trainable=True,\n", + " )\n", + " )\n", + "\n", " # Add the lstm layers\n", " #########################################################\n", " #########################################################\n", - " for size in self._parameters['size_lstm']:\n", + " for size in self._parameters[\"size_lstm\"]:\n", " self._model.add(\n", - " tf.keras.layers.LSTM(units=size, \n", - " recurrent_dropout=self._parameters['rec_dropout'], \n", - " activation=self._parameters['activation'],\n", - " recurrent_activation=self._parameters['recurrent_activation'],\n", - " return_sequences=True))\n", - " if self._parameters['dropout']:\n", - " self._model.add(tf.keras.layers.Dropout(self._parameters['dropout']))\n", + " tf.keras.layers.LSTM(\n", + " units=size,\n", + " recurrent_dropout=self._parameters[\"rec_dropout\"],\n", + " activation=self._parameters[\"activation\"],\n", + " recurrent_activation=self._parameters[\"recurrent_activation\"],\n", + " return_sequences=True,\n", + " )\n", + " )\n", + " if self._parameters[\"dropout\"]:\n", + " self._model.add(tf.keras.layers.Dropout(self._parameters[\"dropout\"]))\n", " #########################################################\n", " #########################################################\n", "\n", " # Add the fully connected layers\n", - " for size in self._parameters['size_fc']:\n", - " self._model.add(\n", - " tf.keras.layers.Dense(units=size, activation='relu'))\n", - " if self._parameters['dropout']:\n", - " self._model.add(\n", - " tf.keras.layers.Dropout(self._parameters['dropout']))\n", + " for size in self._parameters[\"size_fc\"]:\n", + " self._model.add(tf.keras.layers.Dense(units=size, activation=\"relu\"))\n", + " if self._parameters[\"dropout\"]:\n", + " self._model.add(tf.keras.layers.Dropout(self._parameters[\"dropout\"]))\n", "\n", " # Add the final Softmax layer\n", - " self._model.add(\n", - " tf.keras.layers.Dense(num_labels, activation='softmax'))\n", + " self._model.add(tf.keras.layers.Dense(num_labels, activation=\"softmax\"))\n", "\n", " # Output the model into a .pb file for TensorFlow\n", " argmax_layer = tf.keras.backend.argmax(self._model.output)\n", "\n", " # Create confidence layers\n", " final_predicted_layer = CharacterLevelLstmModel._argmax_threshold_layer(\n", - " num_labels, threshold=0.0, default_ind=default_ind)\n", + " num_labels, threshold=0.0, default_ind=default_ind\n", + " )\n", "\n", - " argmax_outputs = self._model.outputs + \\\n", - " [argmax_layer,\n", - " final_predicted_layer(argmax_layer, self._model.output)]\n", + " argmax_outputs = self._model.outputs + [\n", + " argmax_layer,\n", + " final_predicted_layer(argmax_layer, self._model.output),\n", + " ]\n", " self._model = tf.keras.Model(self._model.inputs, argmax_outputs)\n", "\n", " # Compile the model\n", - " softmax_output_layer_name = self._model.outputs[0].name.split('/')[0]\n", + " softmax_output_layer_name = self._model.outputs[0].name.split(\"/\")[0]\n", " losses = {softmax_output_layer_name: \"categorical_crossentropy\"}\n", "\n", " # use f1 score metric\n", - " f1_score_training = F1Score(num_classes=num_labels, average='micro')\n", - " metrics = 
{softmax_output_layer_name: ['acc', f1_score_training]}\n", + " f1_score_training = F1Score(num_classes=num_labels, average=\"micro\")\n", + " metrics = {softmax_output_layer_name: [\"acc\", f1_score_training]}\n", "\n", - " self._model.compile(loss=losses,\n", - " optimizer=\"adam\",\n", - " metrics=metrics)\n", + " self._model.compile(loss=losses, optimizer=\"adam\", metrics=metrics)\n", "\n", " self._epoch_id = 0\n", " self._model_num_labels = num_labels\n", - " self._model_default_ind = default_ind\n" + " self._model_default_ind = default_ind" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "d66bd25c", "metadata": {}, @@ -319,6 +366,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "479f407a", "metadata": {}, @@ -365,6 +413,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "14b78c69", "metadata": {}, @@ -406,6 +455,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cc60ff8a", "metadata": {}, From ed95959e5cf39146359056370af9401f7598d9a1 Mon Sep 17 00:00:00 2001 From: Richard Bann <87214439+drahc1R@users.noreply.github.com> Date: Thu, 6 Jul 2023 16:36:48 -0400 Subject: [PATCH 3/8] Scipy bug fix (#951) * update * renamed var and removed from for loops * refactored var --- dataprofiler/profilers/graph_profiler.py | 16 +++++++++++++++- requirements.txt | 3 ++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/dataprofiler/profilers/graph_profiler.py b/dataprofiler/profilers/graph_profiler.py index 123961d88..ecb5d63f6 100644 --- a/dataprofiler/profilers/graph_profiler.py +++ b/dataprofiler/profilers/graph_profiler.py @@ -1,6 +1,7 @@ """Class and functions to calculate and profile properties of graph data.""" from __future__ import annotations +import importlib import pickle from collections import defaultdict from datetime import datetime @@ -10,6 +11,7 @@ import numpy as np import pandas as pd import scipy.stats as st +from packaging import version from ..data_readers.graph_data import GraphData from . 
import utils @@ -391,6 +393,11 @@ def _get_continuous_distribution( st.lognorm, st.gamma, ] + + scipy_gte_1_11_0 = version.parse( + importlib.metadata.version("scipy") + ) >= version.parse("1.11.0") + for attribute in attributes: if attribute in continuous_attributes: data_as_list = self._attribute_data_as_list(graph, attribute) @@ -401,7 +408,14 @@ def _get_continuous_distribution( for distribution in distribution_candidates: # compute fit, mle, kolmogorov-smirnov test to test fit, and pdf - fit = distribution.fit(df) + + # scipy 1.11.0 updated the way they handle + # the loc parameter in fit() for lognorm + if distribution == st.lognorm and scipy_gte_1_11_0: + fit = distribution.fit(df, superfit=True) + + else: + fit = distribution.fit(df) mle = distribution.nnlf(fit, df) if mle <= best_mle: diff --git a/requirements.txt b/requirements.txt index 994ec78de..8532aaabf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,9 +10,10 @@ fastavro>=1.0.0.post1 python-snappy>=0.5.4 charset-normalizer>=1.3.6 psutil>=4.0.0 -scipy>=1.4.1,<1.11.0 +scipy>=1.4.1 requests>=2.28.1 networkx>=2.5.1 typing-extensions>=3.10.0.2 HLL>=2.0.3 datasketches>=4.1.0 +packaging>=23.0 From 34dad6cacfe4a1ac16fdfe3e1e5f41f4dd531f56 Mon Sep 17 00:00:00 2001 From: Junho Lee <53921230+junholee6a@users.noreply.github.com> Date: Wed, 12 Jul 2023 12:09:18 -0400 Subject: [PATCH 4/8] Make BaseDataProcessor.process() compatible with all argument sets (#954) A method signature that uses *args: Any, **kwargs: Any is compatible with any set of arguments in mypy, despite being an LSP violation. This lets us assert that subclasses of BaseDataProcessor should have some process() method with an arbitrary signature. We also add to the return type of BaseDataPreprocessor so that it is inclusive of all of its subclasses. 
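
For illustration only (editor's sketch, not part of the patch; class and method names are hypothetical): mypy treats a `*args: Any, **kwargs: Any` signature on an abstract method as compatible with any concrete override, so a subclass can declare a narrower, specific signature without a `# type: ignore`, which is exactly the property the message above relies on.

    from __future__ import annotations

    import abc
    from typing import Any


    class BaseProcessor(abc.ABC):
        @abc.abstractmethod
        def process(self, *args: Any, **kwargs: Any) -> Any:
            """Accept any argument set; concrete subclasses narrow this."""
            raise NotImplementedError()


    class BatchProcessor(BaseProcessor):
        # mypy accepts this narrower override of the *args/**kwargs base signature.
        def process(self, data: list[int], batch_size: int = 32) -> list[int]:
            return data[:batch_size]
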
Co-authored-by: JGSweets --- dataprofiler/labelers/data_processing.py | 28 +++++++++++++----------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index 4613d05de..53588d949 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -129,7 +129,7 @@ def set_params(self, **kwargs: Any) -> None: self._parameters[param] = kwargs[param] @abc.abstractmethod - def process(self, *args: Any) -> Any: + def process(self, *args: Any, **kwargs: Any) -> Any: """Process data.""" raise NotImplementedError() @@ -169,13 +169,15 @@ def __init__(self, **parameters: Any) -> None: super().__init__(**parameters) @abc.abstractmethod - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, label_mapping: dict[str, int] | None = None, batch_size: int = 32, - ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None]: + ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] | tuple[ + np.ndarray, np.ndarray + ] | np.ndarray: """Preprocess data.""" raise NotImplementedError() @@ -191,7 +193,7 @@ def __init__(self, **parameters: Any) -> None: super().__init__(**parameters) @abc.abstractmethod - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -240,7 +242,7 @@ def help(cls) -> None: ) print(help_str) - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, @@ -668,7 +670,7 @@ def gen_none() -> Generator[None, None, None]: if batch_data["samples"]: yield batch_data - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, @@ -836,7 +838,7 @@ def _validate_parameters(self, parameters: dict) -> None: if errors: raise ValueError("\n".join(errors)) - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, @@ -1269,7 +1271,7 @@ def match_sentence_lengths( return results - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -1439,7 +1441,7 @@ def convert_to_unstructured_format( return text, entities - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, @@ -1800,7 +1802,7 @@ def convert_to_structured_analysis( return results - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -2022,7 +2024,7 @@ def split_prediction(results: dict) -> None: pred, axis=1, ord=1, keepdims=True ) - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -2160,7 +2162,7 @@ def _save_processor(self, dirpath: str) -> None: ) as fp: json.dump(params, fp) - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -2253,7 +2255,7 @@ def help(cls) -> None: ) print(help_str) - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, From 5125af4ea44e7a5a4cff2218567e4b47950c9b9f Mon Sep 17 00:00:00 2001 From: Junho Lee <53921230+junholee6a@users.noreply.github.com> Date: Wed, 12 Jul 2023 13:51:19 -0400 Subject: [PATCH 5/8] Fix name mangling and typevar errors (#955) Inside the BaseDataProcessor class definition, references to __subclasses are automatically replaced with _BaseDataProcessor__subclasses. This remains the case even in static methods _register_subclass() and get_class(). Same with BaseModel and its __subclasses field. 
So we do not have to write out the full name mangled identifiers inside the class definitions. Also, mypy doesn't seem to be able to handle the return type of BaseDataProcessor.get_class() being a typevar, so that was changed to type[BaseDataProcessor]. This does not affect the functionality of get_class() since it always returns a subclass of BaseDataProcessor. --- dataprofiler/labelers/base_model.py | 6 +++--- dataprofiler/labelers/data_processing.py | 12 +++++------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/dataprofiler/labelers/base_model.py b/dataprofiler/labelers/base_model.py index a4eb0b1d2..032c2ea38 100644 --- a/dataprofiler/labelers/base_model.py +++ b/dataprofiler/labelers/base_model.py @@ -32,7 +32,7 @@ def __new__( class BaseModel(metaclass=abc.ABCMeta): """For labeling data.""" - _BaseModel__subclasses: dict[str, type[BaseModel]] = {} + __subclasses: dict[str, type[BaseModel]] = {} __metaclass__ = abc.ABCMeta # boolean if the label mapping requires the mapping for index 0 reserved @@ -90,7 +90,7 @@ def __eq__(self, other: object) -> bool: def _register_subclass(cls) -> None: """Register a subclass for the class factory.""" if not inspect.isabstract(cls): - cls._BaseModel__subclasses[cls.__name__.lower()] = cls + cls.__subclasses[cls.__name__.lower()] = cls @property def label_mapping(self) -> dict[str, int]: @@ -156,7 +156,7 @@ def get_class(cls, class_name: str) -> type[BaseModel] | None: from .column_name_model import ColumnNameModel # NOQA from .regex_model import RegexModel # NOQA - return cls._BaseModel__subclasses.get(class_name.lower(), None) + return cls.__subclasses.get(class_name.lower(), None) def get_parameters(self, param_list: list[str] | None = None) -> dict: """ diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index 53588d949..bd06a59a4 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -49,16 +49,14 @@ def __init__(self, **parameters: Any) -> None: def _register_subclass(cls) -> None: """Register a subclass for the class factory.""" if not inspect.isabstract(cls): - cls._BaseDataProcessor__subclasses[ # type: ignore - cls.__name__.lower() - ] = cls + cls.__subclasses[cls.__name__.lower()] = cls @classmethod - def get_class(cls: type[Processor], class_name: str) -> type[Processor] | None: + def get_class( + cls: type[BaseDataProcessor], class_name: str + ) -> type[BaseDataProcessor] | None: """Get class of BaseDataProcessor object.""" - return cls._BaseDataProcessor__subclasses.get( # type: ignore - class_name.lower(), None - ) + return cls.__subclasses.get(class_name.lower(), None) def __eq__(self, other: object) -> bool: """ From 92346a308a54f9b509a837c086bd6a9feefc3c56 Mon Sep 17 00:00:00 2001 From: Junho Lee <53921230+junholee6a@users.noreply.github.com> Date: Thu, 20 Jul 2023 12:29:38 -0400 Subject: [PATCH 6/8] None-check labels dependants (#964) The mypy errors addressed here occur because variables label_mapping (in CharPreprocessor), unstructured_labels, and unstructured_label_set (in StructCharPreprocessor.process()) have optional types when they're used. This is fixed by checking that they are not None prior to the operation, which mypy recognizes as removing the None type from them. This should have no effect on functionality because we are already checking that labels is not None, and the variables above all depend on labels such that they are None only if labels is None. 
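
A minimal editor's sketch of the narrowing pattern described above (hypothetical names, not code from this patch): guarding a value with `is not None` before use lets mypy drop `None` from its inferred type inside that branch, which is why the added checks silence the errors without changing behavior.

    def num_classes(label_mapping: dict[str, int] | None) -> int:
        # Without the guard, mypy flags `label_mapping.values()` because the type
        # is Optional; inside the branch it is narrowed to dict[str, int].
        if label_mapping is not None:
            return max(label_mapping.values()) + 1
        return 0
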
--- dataprofiler/labelers/data_processing.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index bd06a59a4..be1a3fee4 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -735,8 +735,8 @@ def process( X_train = np.array( [[sentence] for sentence in batch_data["samples"]], dtype=object ) - if labels is not None: - num_classes = max(label_mapping.values()) + 1 # type: ignore + if labels is not None and label_mapping is not None: + num_classes = max(label_mapping.values()) + 1 Y_train = tf.keras.utils.to_categorical( batch_data["labels"], num_classes @@ -1503,8 +1503,12 @@ def process( unstructured_label_set, ) = self.convert_to_unstructured_format(batch_data, batch_labels) unstructured_data[ind] = unstructured_text - if labels is not None: - unstructured_labels[ind] = unstructured_label_set # type: ignore + if ( + labels is not None + and unstructured_labels is not None + and unstructured_label_set is not None + ): + unstructured_labels[ind] = unstructured_label_set if labels is not None: np_unstruct_labels = np.array(unstructured_labels, dtype="object") From acb9c5efe49801034db05d527d46c03671da7723 Mon Sep 17 00:00:00 2001 From: clee1152 Date: Mon, 24 Jul 2023 11:22:03 -0400 Subject: [PATCH 7/8] Changed `publish-python-package.yml` to include only release branches. (#965) * Changed release option to only release branches named \'release/\'. * Reverted types --- .github/workflows/publish-python-package.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/publish-python-package.yml b/.github/workflows/publish-python-package.yml index d2b43419f..3c593d4ed 100644 --- a/.github/workflows/publish-python-package.yml +++ b/.github/workflows/publish-python-package.yml @@ -7,6 +7,8 @@ name: Publish Python Package on: release: types: [created] + branches: + - 'release/*' jobs: deploy: From 36fc74a1b89c19ac61afb8bd8eb0da4bb7536b56 Mon Sep 17 00:00:00 2001 From: Junho Lee Date: Mon, 24 Jul 2023 21:28:17 -0400 Subject: [PATCH 8/8] Delay transforming priority_order into ndarray In the changed code, we had a mypy error because numpy ndarrays are not compatible with random.Random.shuffle() (expected argument type is MutableSequence[Any]) We fix this by first instantiating priority_order as a list, then shuffling it, then creating an ndarray from it afterwards. --- dataprofiler/labelers/data_processing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index be1a3fee4..fabb7a08b 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -2047,9 +2047,9 @@ def process( elif aggregation_func == "random": num_labels = max(label_mapping.values()) + 1 random_state: random.Random = self._parameters["random_state"] - priority_order = np.array(list(range(num_labels))) - random_state.shuffle(priority_order) # type: ignore - self.priority_prediction(results, priority_order) + priority_order = list(range(num_labels)) + random_state.shuffle(priority_order) + self.priority_prediction(results, np.array(priority_order)) else: raise ValueError( f"`{aggregation_func}` is not a valid aggregation function"
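
Editor's note on the pattern described in the commit message above (a minimal sketch with illustrative values, not code from the patch): `random.Random.shuffle()` is typed against `MutableSequence[Any]`, so shuffling a plain list and building the ndarray only afterwards satisfies mypy while producing the same result.

    import random

    import numpy as np

    rng = random.Random(0)
    priority_order = list(range(5))             # plain list: accepted by Random.shuffle()
    rng.shuffle(priority_order)                 # in-place shuffle, no type error
    priority_array = np.array(priority_order)   # convert to ndarray after shuffling
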