Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

get_ordered_columns and datetime_generator implementation + tests #313

Closed
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
807ea02
added order generator
drahc1R Jul 17, 2023
fdcf8f5
fixed flake8 errors
drahc1R Jul 17, 2023
50979b5
fixed sort, refactored columns_to_generate, and added tests
drahc1R Jul 19, 2023
7aba6c9
fixed pre-commit
drahc1R Jul 19, 2023
4b22d2c
added more tests to order_generator
drahc1R Jul 19, 2023
09cdc87
fixed pre-commit
drahc1R Jul 19, 2023
83b1601
remove comments
drahc1R Jul 19, 2023
6a7c967
empty
drahc1R Jul 19, 2023
ab9ec23
adding extrawhitespace
drahc1R Jul 19, 2023
668a268
reverted whitespace
drahc1R Jul 19, 2023
9a85a60
whitespace
drahc1R Jul 19, 2023
5ec6f25
removed for loop
drahc1R Jul 20, 2023
c6d448b
pre-commit
drahc1R Jul 20, 2023
c98a26d
removed for loops and just made an assert
drahc1R Jul 20, 2023
9835add
removed extra np.asarray
drahc1R Jul 20, 2023
41c2e59
refactored dataset_generator
drahc1R Jul 20, 2023
a24dbbb
ahh lmao"
drahc1R Jul 20, 2023
e684cb6
refactored generate_dataset_by_class
drahc1R Jul 21, 2023
8f7ed4b
comments
drahc1R Jul 21, 2023
d267204
refactored dataset_generator.get_ordered_column
drahc1R Jul 24, 2023
c5a331f
refactored datetime_generator
drahc1R Jul 24, 2023
f3a16a3
fixes to datetime_generator and refactored datetime_generator tests
drahc1R Jul 24, 2023
771e5e6
pre-commit fixes
drahc1R Jul 24, 2023
066b357
fixed missing List import
drahc1R Jul 24, 2023
a57a65b
pre-commits
drahc1R Jul 24, 2023
48d7f69
fixed get_ordered and datetime_gen
drahc1R Jul 25, 2023
46500ba
empty commit
drahc1R Jul 25, 2023
3feb802
added deepcopy to col
drahc1R Jul 25, 2023
47b0f26
pre-commits
drahc1R Jul 25, 2023
e9612b8
added test for get_ordered_datetime_ascending
drahc1R Jul 25, 2023
189a969
added custom datetime_get_ordered tests and descending tests
drahc1R Jul 25, 2023
4cfc611
updated test_dataset_generator asserts and get_col_ordered_column test
drahc1R Jul 26, 2023
3748378
added tests to check output for functions of datetime_generator
drahc1R Jul 26, 2023
4f7b891
test
drahc1R Jul 26, 2023
d510505
test
drahc1R Jul 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions synthetic_data/dataset_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""Contains generator that returns collective df of requested distinct generators."""

from typing import List, Optional

import numpy as np
import pandas as pd
from numpy.random import Generator

from synthetic_data.distinct_generators.categorical_generator import random_categorical
from synthetic_data.distinct_generators.datetime_generator import random_datetimes
from synthetic_data.distinct_generators.float_generator import random_floats
from synthetic_data.distinct_generators.int_generator import random_integers
from synthetic_data.distinct_generators.text_generator import random_string, random_text


def convert_data_to_df(
    np_data: np.array,
    path: Optional[str] = None,
    index: bool = False,
    column_names: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Convert a collection of column arrays to a pandas dataframe.

    Each element of ``np_data`` becomes one column of the dataframe. Names
    and data are paired with ``zip``, so pairing stops at the shorter of
    the two sequences.

    :param np_data: column data to be converted, one entry per column
    :type np_data: numpy array
    :param path: path to output a csv of the dataframe generated
    :type path: str, None, optional
    :param index: whether to include index in output to csv
    :type index: bool, optional
    :param column_names: The names of the columns of a dataset; when
        missing or empty, columns are labelled 0..len(np_data) - 1
    :type column_names: List, None, optional
    :return: a pandas dataframe
    """
    # fall back to positional integer labels when no names are supplied
    if not column_names:
        column_names = list(range(len(np_data)))
    dataframe = pd.DataFrame.from_dict(dict(zip(column_names, np_data)))

    # save the dataframe as a csv file only when a path is requested
    if path:
        dataframe.to_csv(path, index=index, encoding="utf-8")
    return dataframe


def get_ordered_column(
    data: np.array,
    data_type: str,
    order: str = "ascending",
) -> np.array:
    """Return the values of a generated column in sorted order.

    For ``data_type == "datetime"`` every entry of ``data`` is indexed as
    ``entry[0]`` (a strftime format string) and ``entry[1]`` (an object with
    a ``strftime`` method). Entries are ordered by ``entry[1]`` and then
    rendered to strings with their own format. Every other data type is
    sorted with ``np.sort``.

    :param data: column values to be sorted
    :type data: np.array
    :param data_type: name of the data class the column holds
    :type data_type: str
    :param order: "descending" reverses the sorted result; any other value
        leaves it ascending
    :type order: str, optional

    :return: sorted numpy array
    """
    if data_type != "datetime":
        ordered = np.sort(data)
    else:
        # order on the datetime object, not on its formatted string
        chronological = sorted(data, key=lambda entry: entry[1])
        ordered = np.array(
            [entry[1].strftime(entry[0]) for entry in chronological]
        )

    return ordered[::-1] if order == "descending" else ordered


def generate_dataset_by_class(
    rng: Generator,
    columns_to_generate: Optional[List[dict]] = None,
    dataset_length: int = 100000,
    path: Optional[str] = None,
) -> pd.DataFrame:
    """Randomly generate a dataset with a mixture of different data classes.

    Each dict in ``columns_to_generate`` describes one column. Its
    "data_type" key selects the generator ("integer", "float",
    "categorical", "text", "datetime" or "string"). An optional "ordered"
    key set to "ascending" or "descending" sorts the generated column; any
    other value leaves the column in generated order.

    :param rng: the np rng object used to generate random values
    :type rng: numpy Generator
    :param columns_to_generate: Classes of data to be included in the dataset;
        None is treated as an empty list and yields an empty dataframe
    :type columns_to_generate: List[dict], None, optional
    :param dataset_length: length of the dataset generated
    :type dataset_length: int, optional
    :param path: path to output a csv of the dataframe generated
    :type path: str, None, optional

    :raises ValueError: if a column's "data_type" is missing or is not one
        of the supported generators
    :return: pandas DataFrame
    """
    gen_funcs = {
        "integer": random_integers,
        "float": random_floats,
        "categorical": random_categorical,
        "text": random_text,
        "datetime": random_datetimes,
        "string": random_string,
    }

    dataset = []
    # the declared default of None used to crash the loop with a TypeError
    for col in columns_to_generate or []:
        data_type_var = col.get("data_type")
        if data_type_var not in gen_funcs:
            raise ValueError(f"generator: {data_type_var} is not a valid generator.")

        # NOTE(review): the whole column spec, including the "data_type" and
        # "ordered" keys, is forwarded as keyword arguments -- confirm every
        # generator accepts them.
        generated_data = gen_funcs[data_type_var](
            **col, num_rows=dataset_length, rng=rng
        )

        sort = col.get("ordered")
        if sort in ("ascending", "descending"):
            generated_data = get_ordered_column(generated_data, data_type_var, sort)
        dataset.append(generated_data)
    return convert_data_to_df(dataset, path)
17 changes: 10 additions & 7 deletions synthetic_data/distinct_generators/datetime_generator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Contains a datetime generator."""
from typing import Optional
import random
from datetime import datetime
from typing import List, Optional

import numpy as np
import pandas as pd
Expand All @@ -11,7 +13,7 @@ def generate_datetime(
date_format: str,
start_date: pd.Timestamp = pd.Timestamp(1920, 1, 1),
end_date: pd.Timestamp = pd.Timestamp(2049, 12, 31),
) -> str:
) -> list:
"""
Generate datetime given the random_state, date_format, and start/end dates.

Expand All @@ -32,13 +34,14 @@ def generate_datetime(
"""
t = rng.random()
ptime = start_date + t * (end_date - start_date)
ptime = ptime.strftime(date_format)

return ptime.strftime(date_format)
return [date_format, datetime.strptime(ptime, date_format)]

drahc1R marked this conversation as resolved.
Show resolved Hide resolved

def random_datetimes(
rng: Generator,
date_format_list: Optional[str] = None,
date_format_list: Optional[List[str]] = None,
start_date: pd.Timestamp = None,
end_date: pd.Timestamp = None,
num_rows: int = 1,
Expand All @@ -48,9 +51,9 @@ def random_datetimes(

:param rng: the np rng object used to generate random values
:type rng: numpy Generator
:param date_format: the format that the generated datatime will follow,
:param date_format_list: the format that the generated datatime will follow,
defaults to None
:type date_format: str, None, optional
:type date_format: List, None, optional
:param start_date: the earliest date that datetimes can be generated at,
defaults to pd.Timestamp(1920, 1, 1)
:type start_date: pd.Timestamp, None, optional
Expand All @@ -66,7 +69,7 @@ def random_datetimes(
date_format_list = ["%B %d %Y %H:%M:%S"]

for i in range(num_rows):
date_format = rng.choice(date_format_list)
date_format = random.choice(date_format_list)
datetime = generate_datetime(
rng, date_format=date_format, start_date=start_date, end_date=end_date
)
Expand Down
40 changes: 22 additions & 18 deletions tests/distinct_generators/test_datetime_generator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unittest
from datetime import datetime

import numpy as np
import pandas as pd
Expand All @@ -12,57 +13,60 @@ def setUp(self):
self.rng = Generator(PCG64(12345))
self.start_date = pd.Timestamp(2001, 12, 22)
self.end_date = pd.Timestamp(2023, 1, 1)
self.date_format_list = ["%Y-%m-%d", "%d-%m-%Y"]
self.date_format_list = ["%Y %m %d"]

def test_generate_datetime_return_type(self):
date_str = date_generator.generate_datetime(
date = date_generator.generate_datetime(
self.rng, self.date_format_list[0], self.start_date, self.end_date
)
self.assertIsInstance(date_str, str)
self.assertIsInstance(date, list)
self.assertIsInstance(date[0], str)
self.assertIsInstance(date[1], datetime)

def test_generate_datetime_format(self):
date_str = date_generator.generate_datetime(
date = date_generator.generate_datetime(
self.rng, self.date_format_list[0], self.start_date, self.end_date
)
try:
pd.to_datetime(date_str, format=self.date_format_list[0])
pd.to_datetime(date[1], format=date[0])
except ValueError:
self.fail("pd.to_datetime() raised ValueError unexpectedly")

def test_generate_datetime_range(self):
date_str = date_generator.generate_datetime(
date = date_generator.generate_datetime(
self.rng, self.date_format_list[0], self.start_date, self.end_date
)
date_obj = pd.to_datetime(date_str, format=self.date_format_list[0])
date_obj = pd.to_datetime(date[1], format=date[0])
self.assertTrue(self.start_date <= date_obj)
self.assertTrue(date_obj <= self.end_date)

def test_random_datetimes_return_type_and_size(self):
result = date_generator.random_datetimes(
date = date_generator.random_datetimes(
self.rng, self.date_format_list, self.start_date, self.end_date, 5
)
self.assertIsInstance(result, np.ndarray)
self.assertEqual(result.shape[0], 5)
self.assertIsInstance(date, np.ndarray)
self.assertEqual(date.shape[0], 5)

def test_random_datetimes_default_format_usage(self):
result = date_generator.random_datetimes(
dates = date_generator.random_datetimes(
self.rng, None, self.start_date, self.end_date, 10
)
for date_str in result:
for date in dates:
try:
pd.to_datetime(date_str, format="%B %d %Y %H:%M:%S")
pd.to_datetime(date[1], format=date[0])
except ValueError:
self.fail("pd.to_datetime() raised ValueError unexpectedly")

def test_random_datetimes_format_usage(self):
result = date_generator.random_datetimes(
date_formats = ["%Y-%m-%d", "%B %d %Y %H:%M:%S"]
dates = date_generator.random_datetimes(
self.rng, self.date_format_list, self.start_date, self.end_date, 10
)
format_success = [False] * len(self.date_format_list)
for date_str in result:
for i, date_format in enumerate(self.date_format_list):
format_success = [False] * len(date_formats)
for date in dates:
for i in range(len(date_formats)):
try:
pd.to_datetime(date_str, format=date_format)
pd.to_datetime(date[1], format=date[0])
format_success[i] = True
except ValueError:
pass
Expand Down
Loading