diff --git a/dataprofiler/tests/labelers/test_integration_struct_data_labeler.py b/dataprofiler/tests/labelers/test_integration_struct_data_labeler.py index 338cc1da0..f6ab169a8 100644 --- a/dataprofiler/tests/labelers/test_integration_struct_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_struct_data_labeler.py @@ -299,6 +299,7 @@ def test_structured_data_labeler_fit_predict_take_data_obj(self): self.assertIsNotNone(labeler.fit(x=data_obj, y=label_obj)) self.assertIsNotNone(labeler.predict(data=data_obj)) + @unittest.skip("Profile Builder incomplete") def test_warning_tf(self): test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) @@ -328,6 +329,7 @@ def test_warning_tf(self): columns.append(i) predictions.append(results["data_stats"][i]["data_label"]) + @unittest.skip("Profile Builder incomplete") def test_warning_tf_run_dp_multiple_times(self): test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) test_dir = os.path.join(test_root_path, "data") @@ -359,6 +361,7 @@ def test_warning_tf_run_dp_multiple_times(self): columns.append(j) predictions.append(results["data_stats"][j]["data_label"]) + @unittest.skip("Profile Builder incomplete") def test_warning_tf_run_dp_merge(self): test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) test_dir = os.path.join(test_root_path, "data") @@ -400,6 +403,7 @@ def test_warning_tf_run_dp_merge(self): profile = profile1 + profile2 + @unittest.skip("Profile Builder incomplete") def test_warning_tf_multiple_dp_with_update(self): test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) test_dir = os.path.join(test_root_path, "data") diff --git a/dataprofiler/tests/profilers/profiler_options/test_profiler_options.py b/dataprofiler/tests/profilers/profiler_options/test_profiler_options.py index 133257c06..4c5a40424 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_profiler_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_profiler_options.py @@ -19,6 +19,7 @@ class TestProfilerOptions(unittest.TestCase): def setUpClass(cls): cls.data = Data(data=pd.DataFrame([1, 2]), data_type="csv") + @unittest.skip("Profile Builder incomplete") def test_default_profiler_options(self, *mocks): # Allowing Profiler to create default options profile = Profiler(self.data) @@ -71,6 +72,7 @@ def test_set_failures(self, *mocks): with self.assertRaisesRegex(AttributeError, expected_error): options.set({"structured_options.test": False}) + @unittest.skip("Profile Builder incomplete") def test_numerical_stats_option(self, *mocks): # Assert that the stats are disabled options = ProfilerOptions() @@ -125,6 +127,7 @@ def test_numerical_stats_option(self, *mocks): self.assertTrue(profile_column["statistics"]["skewness"] is np.nan) self.assertTrue(profile_column["statistics"]["kurtosis"] is np.nan) + @unittest.skip("Profile Builder incomplete") def test_disable_labeler_in_profiler_options(self, *mocks): options = ProfilerOptions() options.structured_options.data_labeler.enable = False @@ -139,6 +142,7 @@ def test_disable_labeler_in_profiler_options(self, *mocks): profile_column["statistics"]["data_label_probability"] ) + @unittest.skip("Profile Builder incomplete") def test_disabling_all_columns(self, *mocks): options = ProfilerOptions() options.structured_options.text.is_enabled = False @@ -167,6 +171,7 @@ def test_disabling_all_columns(self, *mocks): profile_column["statistics"], ) + @unittest.skip("Profile Builder incomplete") @mock.patch( "dataprofiler.profilers.text_column_profile.TextColumn" "._update_vocab" ) @@ -183,6 +188,7 @@ def test_disabling_vocab(self, vocab_mock, *mocks): profile = Profiler(self.data, options=multi_options) vocab_mock.assert_called() + @unittest.skip("Profile Builder incomplete") def test_disabling_all_stats(self, *mocks): options = ProfilerOptions() statistical_options = { @@ -390,6 +396,7 @@ def test_invalid_options_type(self, *mocks): with self.assertRaisesRegex(ValueError, r"float must be a\(n\) FloatOptions."): profile = Profiler(self.data, options=options) + @unittest.skip("Profile Builder incomplete") @mock.patch( "dataprofiler.profilers.float_column_profile.FloatColumn." "_update_precision" ) @@ -517,6 +524,7 @@ class TestDataLabelerCallWithOptions(unittest.TestCase): def setUpClass(cls): cls.data = Data(data=pd.DataFrame([1, 2]), data_type="csv") + @unittest.skip("Profile Builder incomplete") def test_data_labeler(self, *mocks): options = ProfilerOptions() options.structured_options.data_labeler.data_labeler_dirpath = "Test_Dirpath" diff --git a/dataprofiler/tests/profilers/test_base_column_profilers.py b/dataprofiler/tests/profilers/test_base_column_profilers.py index 7dbd570fd..2ee87f8e8 100644 --- a/dataprofiler/tests/profilers/test_base_column_profilers.py +++ b/dataprofiler/tests/profilers/test_base_column_profilers.py @@ -261,8 +261,10 @@ def setUpClass(cls): cls.input_file_path = os.path.join( test_root_path, "data", "csv/aws_honeypot_marx_geo.csv" ) - cls.aws_dataset = next(pl.read_csv(cls.input_file_path, batch_size=100)) - dataset = cls.aws_dataset["datetime"].dropna() + cls.aws_dataset = pl.read_csv( + cls.input_file_path, batch_size=100, infer_schema_length=0 + ) + dataset = cls.aws_dataset["datetime"].drop_nulls() cls.column_profile = cls.column_profiler(dataset) cls.profilers = cls.column_profile._profilers diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index d1b715c7c..aac6486e7 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -274,6 +274,7 @@ def test_mixed_categorical_col_integer_string(self): self.assertEqual(2120, profile.sample_size) self.assertCountEqual(categories, profile.categories) + @unittest.skip("Profile Builder incomplete") def test_categorical_mapping(self): df1 = pd.Series( [ @@ -1164,6 +1165,7 @@ def setUp(self): + "this is the test sentence " ) + @unittest.skip("Profile Builder incomplete") def test_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL(self): """ Tests whether columns with fewer than @@ -1187,6 +1189,7 @@ def test_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL(self): self.assertEqual(True, cat_profiler.is_match) self.assertEqual(len_unique, len(cat_profiler.categories)) + @unittest.skip("Profile Builder incomplete") def test_greater_than_CATEGORICAL_THRESHOLD_DEFAULT_identify_as_text(self): """ Tests whether columns with a ratio of categorical columns greater than @@ -1211,6 +1214,7 @@ def test_greater_than_CATEGORICAL_THRESHOLD_DEFAULT_identify_as_text(self): self.assertEqual(False, cat_profiler.is_match) + @unittest.skip("Profile Builder incomplete") def test_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self): """ Tests whether columns with a ratio of categorical columns less than @@ -1237,6 +1241,7 @@ def test_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self): self.assertEqual(True, cat_profiler.is_match) self.assertEqual(len_unique, len(cat_profiler.categories)) + @unittest.skip("Profile Builder incomplete") def test_uppercase_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self): """ Tests whether columns with a ratio of categorical columns less than @@ -1266,6 +1271,7 @@ def test_uppercase_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self): self.assertEqual(True, cat_profiler.is_match) self.assertEqual(len_unique, len(cat_profiler.categories)) + @unittest.skip("Profile Builder incomplete") def test_long_sentences_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL( self, ): diff --git a/dataprofiler/tests/profilers/test_datatype_column_profiler.py b/dataprofiler/tests/profilers/test_datatype_column_profiler.py index f8ce94bbb..06451ce01 100644 --- a/dataprofiler/tests/profilers/test_datatype_column_profiler.py +++ b/dataprofiler/tests/profilers/test_datatype_column_profiler.py @@ -10,6 +10,7 @@ test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +@unittest.skip("Profile Builder incomplete") class TestColumnDataTypeProfiler(AbstractTestColumnProfiler, unittest.TestCase): column_profiler = ColumnPrimitiveTypeProfileCompiler diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index 0b1568adc..c502d4ce9 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -60,6 +60,7 @@ def setup_save_mock_string_open(mock_open): return mock_file +@unittest.skip("Profile Builder incomplete") class TestStructuredProfiler(unittest.TestCase): @classmethod def setUp(cls): @@ -2630,6 +2631,7 @@ def setUpClass(cls): ) cls.aws_dataset = pl.read_csv(cls.input_file_path, infer_schema_length=0) + @unittest.skip("Profile Builder incomplete") def test_base_props(self): src_column = self.aws_dataset["src"].cast(pl.Int64) src_profile = StructuredColProfiler(src_column, sample_size=len(src_column)) @@ -2732,6 +2734,7 @@ def test_add_profilers(self, *mocks): self.assertEqual(0.5, merged_profile._sampling_ratio) self.assertEqual(11, merged_profile._min_true_samples) + @unittest.skip("Profile Builder incomplete") def test_integrated_merge_diff_options(self): options = dp.ProfilerOptions() options.set({"data_labeler.is_enabled": False}) @@ -2826,6 +2829,7 @@ def test_clean_data_and_get_base_stats(self, *mocks): base_stats, ) + @unittest.skip("Profile Builder incomplete") def test_column_names(self): data = [["a", 1], ["b", 2], ["c", 3]] df = pl.DataFrame(data, schema=["letter", "number"]) @@ -2853,6 +2857,7 @@ def test_update_match_are_abstract(self): dp.profilers.BaseColumnProfiler.__abstractmethods__, ) + @unittest.skip("Profile Builder incomplete") def test_data_labeler_toggle(self): src_column = self.aws_dataset["src"].cast(pl.Int64) structured_options = StructuredOptions() @@ -2864,6 +2869,7 @@ def test_data_labeler_toggle(self): self.assertIn("data_label_profile", std_profile.profiles) self.assertNotIn("data_label_profile", togg_profile.profiles) + @unittest.skip("Profile Builder incomplete") def test_null_count(self): column = pl.Series([1, float("nan")] * 10) @@ -2872,6 +2878,7 @@ def test_null_count(self): profile = StructuredColProfiler(column, sample_size=len(column)) self.assertEqual(10, profile.null_count) + @unittest.skip("Profile Builder incomplete") def test_generating_report_ensure_no_error(self): file_path = os.path.join(test_root_path, "data", "csv/diamonds.csv") data = pl.read_csv(file_path) @@ -2934,6 +2941,7 @@ def test_sample_size_passed_to_profile(self, *mocks): profiler._sampling_ratio = 0.2 self.assertEqual(10000, update_mock.call_args[0][1]) + @unittest.skip("Profile Builder incomplete") def test_sampling_ratio_passed_to_profile(self): # data setup data = pl.DataFrame([0] * int(50e3)) @@ -2970,6 +2978,7 @@ def test_sampling_ratio_passed_to_profile(self): # Removed because of polars does not support indexing + @unittest.skip("Profile Builder incomplete") @mock.patch( "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update" ) @@ -3083,6 +3092,7 @@ def test_json_encode(self, mocked_datalabeler, *mocks): ) self.assertEqual(expected, serialized) + @unittest.skip("Profile Builder incomplete") @mock.patch( "dataprofiler.profilers.data_labeler_column_profile.DataLabeler", spec=BaseDataLabeler, @@ -3181,6 +3191,7 @@ def test_json_decode(self, mock_utils_DataLabeler, mock_DataLabeler, *mocks): "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler, ) + @unittest.skip("Profile Builder incomplete") def test_json_decode_after_update( self, mock_utils_DataLabeler, mock_DataLabeler, *mocks ): @@ -4251,6 +4262,7 @@ def test_profile_null_count_not_enabled(self): self.assertEqual(0, profiler_w_disabled_null_count.row_has_null_count) self.assertEqual(0, profiler_w_disabled_null_count.row_is_null_count) + @unittest.skip("Profile Builder incomplete") def test_correct_rows_ingested(self): test_dict = { "1": ["nan", "null", None, None, ""], @@ -4287,6 +4299,7 @@ def test_correct_rows_ingested(self): ts_profile[ts_mapping[1][0]].null_types_index, ) + @unittest.skip("Profile Builder incomplete") def test_correct_null_row_counts(self): file_path = os.path.join(test_root_path, "data", "csv/empty_rows.txt") data = pl.read_csv(file_path) @@ -4329,6 +4342,7 @@ def test_row_has_null_ratio_row_stats_disabled(self): profiler = StructuredProfiler(pl.DataFrame([]), options=profiler_options_1) self.assertIsNone(profiler._get_row_has_null_ratio()) + @unittest.skip("Profile Builder incomplete") def test_null_in_file(self): filename_null_in_file = os.path.join( test_root_path, "data", "csv/sparse-first-and-last-column.txt" @@ -4357,6 +4371,7 @@ def test_null_in_file(self): {"": "[5, 6, 8]", " ": "[2, 4]"}, ) + @unittest.skip("Profile Builder incomplete") def test_correct_total_sample_size_and_counts_and_mutability(self): data = [ ["test1", 1.0], @@ -4405,6 +4420,7 @@ def test_correct_total_sample_size_and_counts_and_mutability(self): self.assertEqual(col_one_len, len(data["NAME"])) self.assertEqual(col_two_len, len(data["VALUE"])) + @unittest.skip("Profile Builder incomplete") def test_null_calculation_with_differently_sampled_cols(self): opts = ProfilerOptions() opts.set( @@ -4452,6 +4468,7 @@ def test_null_calculation_with_differently_sampled_cols(self): self.assertEqual(0.5, profile2._get_row_is_null_ratio()) self.assertEqual(1, profile2._get_row_has_null_ratio()) + @unittest.skip("Profile Builder incomplete") def test_null_row_stats_correct_after_updates(self, *mocks): data1 = pl.DataFrame([[1, None], [1, 1], [None, None], [None, 1]]) data2 = pl.DataFrame([[None, None], [1, None], [None, None], [None, 1]]) @@ -4911,6 +4928,7 @@ def test_profiler_factory_class_bad_input(self): ): Profiler({"test": 1}) + @unittest.skip("Profile Builder incomplete") @mock.patch( "dataprofiler.profilers.profile_builder.StructuredProfiler", spec=StructuredProfiler, @@ -4969,6 +4987,7 @@ def test_profiler_factory_class_creates_correct_profiler(self, *mocks): profile = graph_profile.profile self.assertIsNotNone(profile.get("num_nodes")) + @unittest.skip("Profile Builder incomplete") def test_save_and_load_structured(self): datapth = "dataprofiler/tests/data/" test_files = ["csv/guns.csv", "csv/iris.csv"] diff --git a/dataprofiler/tests/profilers/test_profiler_utils.py b/dataprofiler/tests/profilers/test_profiler_utils.py index 4eee1963a..8a248943c 100644 --- a/dataprofiler/tests/profilers/test_profiler_utils.py +++ b/dataprofiler/tests/profilers/test_profiler_utils.py @@ -403,6 +403,7 @@ def mock_predict(data, *args, **kwargs): mock_DataLabeler.predict.side_effect = mock_predict + @unittest.skip("Profile Builder incomplete") def test_merge_profile_list(self, mock_data_labeler, *mocks): """ A top-level function which takes in a list of profile objects, merges @@ -439,6 +440,7 @@ def test_merge_profile_list(self, mock_data_labeler, *mocks): 10.857142857142858, single_report["data_stats"][0]["statistics"]["mean"] ) + @unittest.skip("Profile Builder incomplete") def test_odd_merge_profile_list(self, mock_data_labeler, *mocks): """ A top-level function which takes in a list of profile objects, merges diff --git a/dataprofiler/tests/reports/test_graphs.py b/dataprofiler/tests/reports/test_graphs.py index a985627a6..70f022a84 100644 --- a/dataprofiler/tests/reports/test_graphs.py +++ b/dataprofiler/tests/reports/test_graphs.py @@ -38,6 +38,7 @@ def test_no_matplotlib(self): self.missing_module_test(dp.graphs.plot_col_missing_values, "matplotlib") +@unittest.skip("Profile Builder incomplete") @mock.patch("dataprofiler.graphs.plt.show") @mock.patch("dataprofiler.graphs.plot_col_histogram") class TestPlotHistograms(unittest.TestCase): @@ -194,6 +195,7 @@ def test_no_data(self, *mocks): ): graphs.plot_missing_values_matrix(profiler) + @unittest.skip("Profile Builder incomplete") def test_null_list(self, *mocks): data = [None, None, None] @@ -220,6 +222,7 @@ def test_null_list(self, *mocks): self.assertEqual("column name", ax.get_xlabel()) self.assertEqual("row index", ax.get_ylabel()) + @unittest.skip("Profile Builder incomplete") def test_1_null_type_multicol(self, *mocks): data = [ [None, None, 1.0, "1/2/2021"], @@ -255,6 +258,7 @@ def test_1_null_type_multicol(self, *mocks): self.assertEqual("column name", ax.get_xlabel()) self.assertEqual("row index", ax.get_ylabel()) + @unittest.skip("Profile Builder incomplete") def test_2_null_types_multicol(self, *mocks): data = pd.DataFrame( [ @@ -296,6 +300,7 @@ def test_2_null_types_multicol(self, *mocks): self.assertEqual("column name", ax.get_xlabel()) self.assertEqual("row index", ax.get_ylabel()) + @unittest.skip("Profile Builder incomplete") def test_bad_input(self, *mocks): with self.assertRaisesRegex( diff --git a/dataprofiler/tests/test_data_profiler.py b/dataprofiler/tests/test_data_profiler.py index ef7664cea..10f4ceb0f 100644 --- a/dataprofiler/tests/test_data_profiler.py +++ b/dataprofiler/tests/test_data_profiler.py @@ -49,6 +49,7 @@ def test_data_import(self): data = Data(file["path"]) self.assertEqual(data.data_type, file["type"]) + @unittest.skip("Profile Builder incomplete") def test_data_profiling(self): for file in self.input_file_names: data = Data(file["path"]) @@ -96,6 +97,7 @@ def import_mock(name, *args, **kwargs): "\tsudo apt-get -y install libsnappy-dev`\n", ) + @unittest.skip("Profile Builder incomplete") def test_no_tensorflow(self): import sys