diff --git a/synthetic_data/generators.py b/synthetic_data/generators.py index 2ec37198..b5afe217 100644 --- a/synthetic_data/generators.py +++ b/synthetic_data/generators.py @@ -96,6 +96,7 @@ def synthesize( def _generate_uncorrelated_column_data(self, num_samples): """Generate column data.""" + print(type(self.profile), "SHEEEEEESH WOW") columns = self.profile.report()["data_stats"] dataset = [] column_names = [] @@ -120,6 +121,7 @@ def _generate_uncorrelated_column_data(self, num_samples): if (generator_name == "string") or (generator_name == "text"): if col_.get("categorical", False): + print("ENTERED CATEGORICAL GEN") total = 0 for count in col["statistics"]["categorical_count"].values(): total += count @@ -156,11 +158,12 @@ def _generate_uncorrelated_column_data(self, num_samples): param_build[param[0]] = col_[param[0]] generated_data = generator_func(**param_build) + print(col_["order"]) if col_["order"] in sorting_types: dataset.append( self.get_ordered_column( generated_data, - generator_func, + generator_name, col_["order"], ) ) @@ -221,6 +224,7 @@ def get_ordered_column( :return: sorted numpy array """ + print(data_type) if data_type == "datetime": sorted_data = np.array(sorted(data, key=lambda x: x[1])) sorted_data = sorted_data[:, 0] diff --git a/tests/test_generators.py b/tests/test_generators.py index 934850a8..73cf712c 100644 --- a/tests/test_generators.py +++ b/tests/test_generators.py @@ -48,8 +48,6 @@ def test_synthesize_tabular(self): synthetic_data_2 = generator.synthesize(100) self.assertEqual(len(synthetic_data_2), 100) - # asserts that both methods create the same results - # if this ever fails may need to start setting seeds np.testing.assert_array_equal(synthetic_data, synthetic_data_2) @mock.patch("synthetic_data.generators.make_data_from_report") @@ -65,17 +63,43 @@ def test_synthesize_correlated_method(self, mock_make_data): correlated_tabular_generator.synthesize(num_samples=10) mock_make_data.assert_called_once() + def test_synthesize_uncorrelated_output(self): + generator = TabularGenerator(profile=self.profile, is_correlated=False, seed=42) + self.assertFalse(generator.is_correlated) + actual_synthetic_data = generator.synthesize(20) + + self.assertEqual(len(actual_synthetic_data), 20) + self.assertIsInstance(actual_synthetic_data, pd.DataFrame) + + np.testing.assert_array_equal( + actual_synthetic_data.columns.values, + np.array(['datetime', 'string', 'int', 'float', 'text'], dtype="object"), + ) # @mock.patch("generate_uncorrelated_column_data.TabularGenerator", spec=TabularGenerator) class TestGenerateUncorrelatedColumnData(unittest.TestCase): - # @staticmethod - # def setup_tabular_generator_mock(mock_generator): - # mock_DataLabeler = mock_generator.return_value + @classmethod + def setUpClass(cls): + cls.profile_options = dp.ProfilerOptions() + cls.profile_options.set( + { + "data_labeler.is_enabled": False, + "correlation.is_enabled": True, + "multiprocess.is_enabled": False, + } + ) + dp.set_seed(0) - def setUp(self): - self.dataset_length = 10 - self.rng = np.random.Generator(np.random.PCG64(12345)) - self.columns_to_gen = [ + # create dataset and profile for tabular + cls.data = dp.Data(os.path.join(test_dir, "data/tabular.csv")) + cls.profile = dp.Profiler( + data=cls.data, + options=cls.profile_options, + samples_per_update=len(cls.data), + ) + cls.dataset_length = 10 + cls.rng = np.random.Generator(np.random.PCG64(12345)) + cls.columns_to_gen = [ {"generator": "integer", "name": "int", "min_value": 4, "max_value": 88}, { "generator": "datetime", @@ -105,7 +129,13 @@ def setUp(self): "sig_figs": 3, }, ] + + # @staticmethod + # def setup_tabular_generator_mock(mock_generator): + # mock_DataLabeler = mock_generator.return_value + + # TEST PARAM_BUILD @mock.patch("synthetic_data.generators.random_integers") @mock.patch("synthetic_data.generators.random_floats") @mock.patch("synthetic_data.generators.random_categorical") @@ -123,7 +153,6 @@ def test_generate_uncorrelated_column_data( mock_random_floats, mock_random_integers, ): - """Test the param_build""" generator = TabularGenerator(profile=self.profile, is_correlated=False, seed=42) self.assertFalse(generator.is_correlated) expected_calls = [ @@ -276,80 +305,117 @@ def test_generate_uncorrelated_column_data( else: self.assertEqual(call_args_list[key], expected_calls[j][key]) - def test_get_ordered_column_integration(self): - columns_to_gen = [ + + # mock the report to have to columns and then check to see if the (output of synthesize).values == the expected_df.values containing have the sorted expected stuff + @mock.patch("dataprofiler.profilers.StructuredProfiler.report") + def test_get_ordered_column_integration(self, mock_report): + mock_report.return_value = {"data_stats": [ { - "generator": "integer", - "name": "int", - "min_value": 4, - "max_value": 88, + "data_type": "int", "order": "ascending", + "statistics": { + "min": 1.0, + "max": 4.0, + }, }, { - "generator": "datetime", - "name": "dat", - "date_format_list": ["%Y-%m-%d"], - "start_date": pd.Timestamp(2001, 12, 22), - "end_date": pd.Timestamp(2022, 12, 22), + "data_type": "string", + "categorical": False, "order": "ascending", + "statistics": { + "min": 4.0, + "max": 5.0, + "vocab": ['q', 'p', 'a', 'w', 'e', 'r', 'i', 's', 'd', 'f'] + }, }, { - "generator": "text", - "name": "txt", - "chars": ["0", "1"], - "str_len_min": 2, - "str_len_max": 5, + "data_type": "string", + "categorical": True, "order": "ascending", + "statistics": { + "min": 10, + "max": 13, + "categorical_count": {"red": 1, "blue": 2, "yellow": 1, "orange": 3}, + "categories": ["blue", "yellow", "red", "orange"] + } }, { - "generator": "categorical", - "name": "cat", - "categories": ["X", "Y", "Z"], - "probabilities": [0.1, 0.5, 0.4], + "data_type": "float", "order": "ascending", + "statistics": { + "min": 2.11234, + "max": 8.0, + "precision": {"max": 6} + }, }, { - "generator": "float", - "name": "flo", - "min_value": 3, - "max_value": 10, - "sig_figs": 3, + "data_type": "datetime", "order": "ascending", - }, - ] - expected_data = [ - np.array([21, 23, 30, 36, 57, 60, 62, 70, 70, 87]), - np.array( - [ - "2003-12-27", - "2005-11-23", - "2007-03-10", - "2008-12-17", - "2011-04-02", - "2014-07-16", - "2015-12-26", - "2016-02-07", - "2021-10-01", - "2021-11-24", - ] - ), - np.array( - ["00", "000", "0001", "01", "0100", "10", "10", "100", "1110", "1111"] - ), - np.array(["Y", "Y", "Y", "Y", "Y", "Y", "Z", "Z", "Z", "Z"]), - np.array( - [3.035, 3.477, 4.234, 4.812, 4.977, 5.131, 5.379, 5.488, 7.318, 7.4] - ), - ] + "statistics": { + "format": ['%Y-%m-%d'], + "min": '2000-12-09', + "max": '2030-04-23' + } + } + ] + } + generator = TabularGenerator(profile=self.profile, is_correlated=False, seed=42) + self.assertFalse(generator.is_correlated) + + expected_df = [np.array([1, '*|Z+Y&,q(ZH', 2.194392, '2001-09-01']), + np.array([1, '64cCO{nts,G', 2.829648, '2004-01-18']), + np.array([1, '810I1-c5Chp}', 2.835097, '2004-07-28']), + np.array([1, '@I)<@V@Lxs', 2.888464, '2004-10-14']), + np.array([1, "C+f mj@I'(k", 2.95956, '2006-10-21']), + np.array([2, 'I97I)n,DRuRf', 3.014826, '2009-05-08']), + np.array([2, 'N"JQ3-Qc]~3q', 3.167811, '2009-10-11']), + np.array([2, "Nc0dK!!LaX '", 3.268881, '2012-08-12']), + np.array([3, 'PfCKI+&$r&P', 3.464477, '2015-04-16']), + np.array([3, 'Qs=B*u&!pd7N', 3.531093, '2017-03-30']), + np.array([3, 'Z`fzx