def convert_data_to_df(
    np_data: np.ndarray,
    path: Optional[str] = None,
    index: bool = False,
    column_names: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Convert a collection of column arrays to a pandas dataframe.

    Each element of ``np_data`` becomes one column of the dataframe.

    :param np_data: columns to be converted, one array-like per column
    :type np_data: numpy array or list of numpy arrays
    :param path: path to output a csv of the dataframe generated
    :type path: str, None, optional
    :param index: whether to include index in output to csv
    :type index: bool, optional
    :param column_names: The names of the columns of a dataset; when omitted
        or empty the columns are named ``0 .. n-1``
    :type column_names: List, None, optional
    :raises ValueError: if ``column_names`` is given but its length differs
        from the number of columns in ``np_data``
    :return: a pandas dataframe
    """
    num_columns = len(np_data)
    if column_names is None or len(column_names) == 0:
        column_names = list(range(num_columns))
    elif len(column_names) != num_columns:
        # zip() below would silently drop the unmatched columns/names
        raise ValueError(
            f"column_names has {len(column_names)} entries but the data has "
            f"{num_columns} columns."
        )

    # convert array into dataframe
    dataframe = pd.DataFrame.from_dict(dict(zip(column_names, np_data)))

    # save the dataframe as a csv file
    if path:
        dataframe.to_csv(path, index=index, encoding="utf-8")
    return dataframe


def get_ordered_column(
    data: np.ndarray,
    data_type: str,
    order: str = "ascending",
) -> np.ndarray:
    """Sort a numpy array based on data type.

    For ``data_type == "datetime"`` every row of ``data`` is indexed as
    ``row[0]`` (the value that is returned) and ``row[1]`` (the value that is
    used as the sort key). Any other ``data_type`` is sorted with ``np.sort``.

    :param data: numpy array to be sorted
    :type data: np.array
    :param data_type: data type of the column; only "datetime" is treated
        specially
    :type data_type: str
    :param order: "descending" reverses the sorted result, any other value
        yields ascending order
    :type order: str, optional

    :return: sorted numpy array
    """
    if data_type == "datetime":
        rows = sorted(data, key=lambda row: row[1])
        if not rows:
            # an empty input has no second axis to slice the first column from
            return np.array([], dtype=object)
        sorted_data = np.array(rows)[:, 0]
    else:
        sorted_data = np.sort(data)

    if order == "descending":
        return sorted_data[::-1]
    return sorted_data


def generate_dataset_by_class(
    rng: Generator,
    columns_to_generate: Optional[List[dict]] = None,
    dataset_length: int = 100000,
    path: Optional[str] = None,
) -> pd.DataFrame:
    """Randomly generate a dataset with a mixture of different data classes.

    Every entry of ``columns_to_generate`` describes one column. Its
    ``data_type`` key selects the generator ("integer", "float",
    "categorical", "text", "datetime" or "string"), its optional ``ordered``
    key ("ascending" or "descending") requests a sorted column, and all
    remaining keys are forwarded to the generator as keyword arguments.

    :param rng: the np rng object used to generate random values
    :type rng: numpy Generator
    :param columns_to_generate: Classes of data to be included in the dataset;
        ``None`` or an empty list produces an empty dataframe
    :type columns_to_generate: List[dict], None, optional
    :param dataset_length: length of the dataset generated
    :type dataset_length: int, optional
    :param path: path to output a csv of the dataframe generated
    :type path: str, None, optional
    :raises ValueError: if a column's ``data_type`` is missing or unknown

    :return: pandas DataFrame
    """
    if not columns_to_generate:
        # nothing requested: same result an empty list has always produced
        return convert_data_to_df([], path)

    gen_funcs = {
        "integer": random_integers,
        "float": random_floats,
        "categorical": random_categorical,
        "text": random_text,
        "datetime": random_datetimes,
        "string": random_string,
    }

    dataset = []
    for col in columns_to_generate:
        # work on a copy so the caller's column description is never mutated
        col_ = copy.deepcopy(col)
        # "data_type" and "ordered" are directives for this function, not
        # generator arguments, so remove them before forwarding the rest
        data_type_var = col_.pop("data_type", None)
        sort = col_.pop("ordered", None)
        if data_type_var not in gen_funcs:
            raise ValueError(f"generator: {data_type_var} is not a valid generator.")

        generated_data = gen_funcs[data_type_var](
            **col_, num_rows=dataset_length, rng=rng
        )

        if sort in ("ascending", "descending"):
            dataset.append(get_ordered_column(generated_data, data_type_var, sort))
        elif data_type_var == "datetime":
            # unsorted datetime output: keep only the first column of each row
            dataset.append(generated_data[:, 0])
        else:
            dataset.append(generated_data)
    return convert_data_to_df(dataset, path)
date_format = random.choice(date_format_list) datetime = generate_datetime( rng, date_format=date_format, start_date=start_date, end_date=end_date ) diff --git a/tests/distinct_generators/test_datetime_generator.py b/tests/distinct_generators/test_datetime_generator.py index 5c10ffba..135eac90 100644 --- a/tests/distinct_generators/test_datetime_generator.py +++ b/tests/distinct_generators/test_datetime_generator.py @@ -1,4 +1,5 @@ import unittest +from datetime import datetime import numpy as np import pandas as pd @@ -12,58 +13,80 @@ def setUp(self): self.rng = Generator(PCG64(12345)) self.start_date = pd.Timestamp(2001, 12, 22) self.end_date = pd.Timestamp(2023, 1, 1) - self.date_format_list = ["%Y-%m-%d", "%d-%m-%Y"] - - def test_generate_datetime_return_type(self): - date_str = date_generator.generate_datetime( + self.date_format_list = ["%Y %m %d"] + self.generate_datetime_output = date_generator.generate_datetime( self.rng, self.date_format_list[0], self.start_date, self.end_date ) - self.assertIsInstance(date_str, str) + self.random_datetimes_output = date_generator.random_datetimes( + self.rng, self.date_format_list, self.start_date, self.end_date, 10 + ) + + def test_generate_datetime_return_type(self): + self.assertIsInstance(self.generate_datetime_output, list) + self.assertIsInstance(self.generate_datetime_output[0], str) + self.assertIsInstance(self.generate_datetime_output[1], datetime) def test_generate_datetime_format(self): - date_str = date_generator.generate_datetime( - self.rng, self.date_format_list[0], self.start_date, self.end_date - ) try: - pd.to_datetime(date_str, format=self.date_format_list[0]) + pd.to_datetime( + self.generate_datetime_output[1], + format=self.generate_datetime_output[0], + ) except ValueError: self.fail("pd.to_datetime() raised ValueError unexpectedly") def test_generate_datetime_range(self): - date_str = date_generator.generate_datetime( - self.rng, self.date_format_list[0], self.start_date, self.end_date + date_obj 
= pd.to_datetime( + self.generate_datetime_output[1], format=self.generate_datetime_output[0] ) - date_obj = pd.to_datetime(date_str, format=self.date_format_list[0]) self.assertTrue(self.start_date <= date_obj) self.assertTrue(date_obj <= self.end_date) def test_random_datetimes_return_type_and_size(self): - result = date_generator.random_datetimes( - self.rng, self.date_format_list, self.start_date, self.end_date, 5 - ) - self.assertIsInstance(result, np.ndarray) - self.assertEqual(result.shape[0], 5) + + self.assertIsInstance(self.random_datetimes_output, np.ndarray) + self.assertEqual(self.random_datetimes_output.shape[0], 10) def test_random_datetimes_default_format_usage(self): - result = date_generator.random_datetimes( - self.rng, None, self.start_date, self.end_date, 10 - ) - for date_str in result: + for date in self.random_datetimes_output: try: - pd.to_datetime(date_str, format="%B %d %Y %H:%M:%S") + pd.to_datetime(date[1], format=date[0]) except ValueError: self.fail("pd.to_datetime() raised ValueError unexpectedly") def test_random_datetimes_format_usage(self): - result = date_generator.random_datetimes( - self.rng, self.date_format_list, self.start_date, self.end_date, 10 - ) - format_success = [False] * len(self.date_format_list) - for date_str in result: - for i, date_format in enumerate(self.date_format_list): + date_formats = ["%Y-%m-%d", "%B %d %Y %H:%M:%S"] + format_success = [False] * len(date_formats) + for date in self.random_datetimes_output: + for i in range(len(date_formats)): try: - pd.to_datetime(date_str, format=date_format) + pd.to_datetime(date[1], format=date[0]) format_success[i] = True except ValueError: pass self.assertGreater(sum(format_success), 1) + + def test_random_datetimes_output(self): + outputs = [ + np.array(["2008 08 19", datetime(2008, 8, 19, 0, 0)]), + np.array(["2018 09 27", datetime(2018, 9, 27, 0, 0)]), + np.array(["2016 03 11", datetime(2016, 3, 11, 0, 0)]), + np.array(["2010 03 13", datetime(2010, 3, 13, 0, 
class TestDatasetGenerator(unittest.TestCase):
    """Tests for ``dataset_generator.get_ordered_column``.

    The datetime tests draw five values from ``random_datetimes`` with the
    seeded generator created in ``setUp`` and compare the sorted output with
    the hard-coded dates below.
    """

    DEFAULT_DATE_FORMAT = "%B %d %Y %H:%M:%S"
    CUSTOM_DATE_FORMAT = "%Y %m %d"

    # expected datetimes, oldest first, rendered in the default format
    DEFAULT_DATES_ASCENDING = [
        "October 02 2006 22:34:32",
        "August 19 2008 16:53:49",
        "March 13 2010 17:18:44",
        "March 11 2016 15:15:39",
        "September 27 2018 18:24:03",
    ]
    # expected datetimes, oldest first, rendered in the custom format
    CUSTOM_DATES_ASCENDING = [
        "2006 10 02",
        "2008 08 19",
        "2010 03 13",
        "2016 03 11",
        "2018 09 27",
    ]

    def setUp(self):
        self.rng = Generator(PCG64(12345))
        self.start_date = pd.Timestamp(2001, 12, 22)
        self.end_date = pd.Timestamp(2023, 1, 1)
        self.date_format_list = [self.DEFAULT_DATE_FORMAT]

    @staticmethod
    def _datetime_rows(date_strings, date_format):
        """Build ``[date string, parsed datetime]`` rows for the given strings."""
        return np.array(
            [
                [date_string, datetime.strptime(date_string, date_format)]
                for date_string in date_strings
            ]
        )

    def _check_datetime_ordering(self, order, expected_dates, date_format_list=None):
        """Generate five datetimes, sort them by ``order`` and compare the result."""
        data = dategen.random_datetimes(
            rng=self.rng,
            date_format_list=date_format_list,
            start_date=self.start_date,
            end_date=self.end_date,
            num_rows=5,
        )
        parse_format = (date_format_list or self.date_format_list)[0]
        # parsing the expected strings also checks that they match the format
        ordered_data = self._datetime_rows(expected_dates, parse_format)[:, 0]

        output_data = dg.get_ordered_column(data, "datetime", order)

        np.testing.assert_array_equal(output_data, ordered_data)

    def test_get_ordered_column_datetime_ascending(self):
        self._check_datetime_ordering("ascending", self.DEFAULT_DATES_ASCENDING)

    def test_get_ordered_column_datetime_descending(self):
        self._check_datetime_ordering("descending", self.DEFAULT_DATES_ASCENDING[::-1])

    def test_get_ordered_column_custom_datetime_ascending(self):
        self._check_datetime_ordering(
            "ascending", self.CUSTOM_DATES_ASCENDING, [self.CUSTOM_DATE_FORMAT]
        )

    def test_get_ordered_column_custom_datetime_descending(self):
        self._check_datetime_ordering(
            "descending",
            self.CUSTOM_DATES_ASCENDING[::-1],
            [self.CUSTOM_DATE_FORMAT],
        )

    def test_get_ordered_column(self):
        # data type -> (unsorted input, expected result of the default ordering)
        cases = OrderedDict(
            [
                ("int", (np.array([5, 4, 3, 2, 1]), np.array([1, 2, 3, 4, 5]))),
                (
                    "float",
                    (
                        np.array([5.0, 4.0, 3.0, 2.0, 1.0]),
                        np.array([1.0, 2.0, 3.0, 4.0, 5.0]),
                    ),
                ),
                (
                    "string",
                    (
                        np.array(["abcde", "bcdea", "cdeab", "deabc", "eabcd"]),
                        np.array(["abcde", "bcdea", "cdeab", "deabc", "eabcd"]),
                    ),
                ),
                (
                    "categorical",
                    (
                        np.array(["E", "D", "C", "B", "A"]),
                        np.array(["A", "B", "C", "D", "E"]),
                    ),
                ),
                (
                    "datetime",
                    (
                        self._datetime_rows(
                            self.DEFAULT_DATES_ASCENDING[::-1],
                            self.date_format_list[0],
                        ),
                        np.array(self.DEFAULT_DATES_ASCENDING),
                    ),
                ),
            ]
        )

        # compare column by column instead of stacking columns of mixed dtypes
        for data_type, (unsorted_data, ordered_data) in cases.items():
            with self.subTest(data_type=data_type):
                output_data = dg.get_ordered_column(unsorted_data, data_type)
                np.testing.assert_array_equal(output_data, ordered_data)