Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataset_generator + tests #314

Merged
merged 37 commits into from
Jul 26, 2023
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
669b366
lots of stuff
lizlouise1335 Jul 18, 2023
726bba4
formatting
lizlouise1335 Jul 18, 2023
1bc3377
isort
lizlouise1335 Jul 18, 2023
6d3566f
removing ordered stuff until richard is done
lizlouise1335 Jul 18, 2023
dd2759c
regex
lizlouise1335 Jul 18, 2023
71b272f
Update synthetic_data/dataset_generator.py
lizlouise1335 Jul 18, 2023
d79ce5f
pr requests
lizlouise1335 Jul 18, 2023
359bf69
Merge branch 'data_gen' of https://github.com/lizlouise1335/synthetic…
lizlouise1335 Jul 18, 2023
323bb47
len assert
lizlouise1335 Jul 18, 2023
b9e7c81
new test for path
lizlouise1335 Jul 18, 2023
ba3a7fb
start/end date generator none type test
lizlouise1335 Jul 19, 2023
4fa00b3
start/end date generator none type test
lizlouise1335 Jul 19, 2023
8397161
updates
lizlouise1335 Jul 20, 2023
3243666
doc string
lizlouise1335 Jul 20, 2023
c78e5da
whoops
lizlouise1335 Jul 20, 2023
f813552
no more successful value error
lizlouise1335 Jul 20, 2023
928b769
last commit? plz?
lizlouise1335 Jul 20, 2023
d85efa2
generator parameter order updates plus DS generator update
lizlouise1335 Jul 24, 2023
be44a18
whoops
lizlouise1335 Jul 24, 2023
d2e8c18
error change
lizlouise1335 Jul 24, 2023
e6264b3
datetime test update
lizlouise1335 Jul 24, 2023
e2a4903
revert
lizlouise1335 Jul 24, 2023
fe1ff6c
Empty colums_to_generate triggers warning, tests for **col_
lizlouise1335 Jul 25, 2023
3eae1df
trying to fix github
lizlouise1335 Jul 25, 2023
9ce52d4
ahhh
lizlouise1335 Jul 25, 2023
144609a
update
lizlouise1335 Jul 25, 2023
00b6a65
whoops
lizlouise1335 Jul 25, 2023
61cc611
Name option for columns
lizlouise1335 Jul 25, 2023
ee37fce
WIP tests
lizlouise1335 Jul 25, 2023
b0cf92b
fixed tests
lizlouise1335 Jul 26, 2023
d6ca5b0
Finished tests
lizlouise1335 Jul 26, 2023
794526f
ok now the tests are actually done
lizlouise1335 Jul 26, 2023
4a5f232
test update per taylor's request
lizlouise1335 Jul 26, 2023
356880b
small change to test
lizlouise1335 Jul 26, 2023
1d988f8
last commit? plz?
lizlouise1335 Jul 26, 2023
626939f
last commit? ;__;
lizlouise1335 Jul 26, 2023
b1a283d
ok now it's last one
lizlouise1335 Jul 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions synthetic_data/dataset_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""Contains generator that returns collective df of requested distinct generators."""

import copy
import logging
from typing import List, Optional

import numpy as np
import pandas as pd
from numpy.random import Generator

from synthetic_data.distinct_generators.categorical_generator import random_categorical
from synthetic_data.distinct_generators.datetime_generator import random_datetimes
from synthetic_data.distinct_generators.float_generator import random_floats
from synthetic_data.distinct_generators.int_generator import random_integers
from synthetic_data.distinct_generators.text_generator import random_string, random_text


def convert_data_to_df(
    np_data: np.array,
    path: Optional[str] = None,
    index: bool = False,
    column_names: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Convert a sequence of column arrays to a pandas dataframe.

    Each element of ``np_data`` becomes one column of the result. Columns that
    share a name are all kept (the frame then has repeated column labels)
    instead of silently overwriting one another.

    :param np_data: columns of the dataset, one entry per column
    :type np_data: numpy array
    :param path: path to output a csv of the dataframe generated
    :type path: str, None, optional
    :param index: whether to include index in output to csv
    :type index: bool, optional
    :param column_names: The names of the columns of a dataset; when empty or
        None the columns are labelled 0..n-1
    :type column_names: List, None, optional
    :return: a pandas dataframe
    """
    # fall back to positional integer labels when no names are supplied
    if not column_names:
        column_names = list(range(len(np_data)))

    # pair every name with its column; as with the previous zip-based
    # construction, surplus names or surplus columns are ignored
    named_columns = list(zip(column_names, np_data))

    # Key the intermediate dict by position, not by name: a name-keyed dict
    # collapses columns that share a label and loses their data.
    dataframe = pd.DataFrame.from_dict(
        {position: values for position, (_, values) in enumerate(named_columns)}
    )
    if named_columns:
        dataframe.columns = [name for name, _ in named_columns]

    # save the dataframe as a csv file
    if path:
        dataframe.to_csv(path, index=index, encoding="utf-8")
    return dataframe


def generate_dataset_by_class(
    rng: Generator,
    columns_to_generate: List[dict],
    dataset_length: int = 100000,
    path: Optional[str] = None,
) -> pd.DataFrame:
    """
    Build a dataset whose columns come from the requested generators.

    Every entry of ``columns_to_generate`` describes one column. Its
    ``"generator"`` key selects the generator function, an optional ``"name"``
    key labels the column (the generator key is used when it is absent), and
    all remaining keys are forwarded to the generator as keyword arguments.

    :param rng: the np rng object used to generate random values
    :type rng: numpy Generator
    :param columns_to_generate: descriptions of the columns to include; an
        empty or None value logs a warning and yields an empty dataframe
    :type columns_to_generate: List[dict]
    :param dataset_length: number of rows generated per column, default 100,000
    :type dataset_length: int, optional
    :param path: path to output a csv of the dataframe generated
    :type path: str, None, optional

    :raises ValueError: if a column names a generator that is not supported
    :raises KeyError: if a column description has no ``"generator"`` key
    :return: pandas DataFrame
    """
    # nothing requested: warn and hand back an empty frame
    if not columns_to_generate:
        logging.warning(
            "columns_to_generate is empty, empty dataframe will be returned."
        )
        return pd.DataFrame()

    generators_by_key = {
        "integer": random_integers,
        "float": random_floats,
        "categorical": random_categorical,
        "text": random_text,
        "datetime": random_datetimes,
        "string": random_string,
    }

    generated_columns = []
    labels = []
    for column_spec in columns_to_generate:
        # work on a copy so popping keys never alters the caller's dict
        options = copy.deepcopy(column_spec)
        generator_key = options.pop("generator")
        if generator_key not in generators_by_key:
            raise ValueError(f"generator: {generator_key} is not a valid generator.")
        # an explicit name wins; otherwise the column is named after its generator
        label = options.pop("name", generator_key)
        make_column = generators_by_key[generator_key]
        generated_columns.append(
            make_column(**options, rng=rng, num_rows=dataset_length)
        )
        labels.append(label)
    return convert_data_to_df(generated_columns, path, column_names=labels)
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
def random_categorical(
rng: Generator,
categories: Optional[List[str]] = None,
num_rows: int = 1,
probabilities: Optional[List[float]] = None,
num_rows: int = 1,
) -> np.array:
"""
Randomly generates an array of categorical values chosen out of categories.
Expand Down
8 changes: 6 additions & 2 deletions synthetic_data/distinct_generators/datetime_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
def generate_datetime(
rng: Generator,
date_format: str,
start_date: pd.Timestamp = pd.Timestamp(1920, 1, 1),
lizlouise1335 marked this conversation as resolved.
Show resolved Hide resolved
end_date: pd.Timestamp = pd.Timestamp(2049, 12, 31),
start_date: pd.Timestamp,
end_date: pd.Timestamp,
) -> str:
"""
Generate datetime given the random_state, date_format, and start/end dates.
Expand All @@ -30,6 +30,10 @@ def generate_datetime(
:return: generated datetime
:rtype: str
"""
if start_date is None:
start_date: pd.Timestamp = pd.Timestamp(1920, 1, 1)
if end_date is None:
end_date: pd.Timestamp = pd.Timestamp(2049, 12, 31)
t = rng.random()
ptime = start_date + t * (end_date - start_date)

Expand Down
6 changes: 3 additions & 3 deletions synthetic_data/distinct_generators/text_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
def random_string(
rng: Generator,
chars: Optional[List[str]] = None,
num_rows: int = 1,
str_len_min: int = 1,
str_len_max: int = 256,
num_rows: int = 1,
) -> np.array:
"""
Randomly generates an array of strings with length between the min and max values.
Expand Down Expand Up @@ -50,9 +50,9 @@ def random_string(
def random_text(
rng: Generator,
chars: Optional[str] = None,
num_rows: int = 1,
str_len_min: int = 256,
str_len_max: int = 1000,
num_rows: int = 1,
) -> np.array:
"""
Randomly generates an array of text with length between the min and max values.
Expand All @@ -75,4 +75,4 @@ def random_text(
f"str_len_min must be > 255. " f"Value provided: {str_len_min}."
)

return random_string(rng, chars, num_rows, str_len_min, str_len_max)
return random_string(rng, chars, str_len_min, str_len_max, num_rows)
15 changes: 13 additions & 2 deletions tests/distinct_generators/test_datetime_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,17 @@ def setUp(self):
self.end_date = pd.Timestamp(2023, 1, 1)
self.date_format_list = ["%Y-%m-%d", "%d-%m-%Y"]

def test_start_end_date_when_none(self):
    """Check that passing ``None`` for both dates yields a parseable date string."""
    date_str = date_generator.generate_datetime(
        self.rng, self.date_format_list[0], start_date=None, end_date=None
    )
    try:
        pd.to_datetime(date_str, format=self.date_format_list[0])
    # catch only the parse failure; a bare except would also swallow
    # unrelated errors and misreport them as a ValueError
    except ValueError:
        self.fail(
            "pd.to_datetime() raised ValueError for start_date, end_date = None"
        )
Comment on lines +21 to +26
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh hmmm.... I'm not sure this is good....

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

leaving these here for documentation


def test_generate_datetime_return_type(self):
date_str = date_generator.generate_datetime(
self.rng, self.date_format_list[0], self.start_date, self.end_date
Expand All @@ -27,7 +38,7 @@ def test_generate_datetime_format(self):
try:
pd.to_datetime(date_str, format=self.date_format_list[0])
except ValueError:
self.fail("pd.to_datetime() raised ValueError unexpectedly")
self.fail("pd.to_datetime() raised ValueError for custom formatting")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh I see you are matching other tests


def test_generate_datetime_range(self):
date_str = date_generator.generate_datetime(
Expand All @@ -52,7 +63,7 @@ def test_random_datetimes_default_format_usage(self):
try:
pd.to_datetime(date_str, format="%B %d %Y %H:%M:%S")
except ValueError:
self.fail("pd.to_datetime() raised ValueError unexpectedly")
self.fail("pd.to_datetime() raised ValueError for default formatting")

def test_random_datetimes_format_usage(self):
result = date_generator.random_datetimes(
Expand Down
130 changes: 130 additions & 0 deletions tests/test_dataset_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import unittest
from unittest import mock

import pandas as pd
from numpy.random import PCG64, Generator

from synthetic_data.dataset_generator import generate_dataset_by_class


class TestDatasetGenerator(unittest.TestCase):
    """Tests for ``generate_dataset_by_class``."""

    def setUp(self):
        # fixed seed so every run draws the same values
        self.dataset_length = 10
        self.rng = Generator(PCG64(12345))

    def test_generate_dataset_with_invalid_generator(self):
        """An unknown generator key is rejected with a ValueError."""
        bad_columns = [{"generator": "non existent generator"}]
        expected_message = (
            "generator: non existent generator is not a valid generator."
        )
        with self.assertRaisesRegex(ValueError, expected_message):
            generate_dataset_by_class(
                self.rng,
                columns_to_generate=bad_columns,
                dataset_length=self.dataset_length,
                path=None,
            )

    @mock.patch("synthetic_data.dataset_generator.logging.warning")
    def test_generate_dataset_with_none_columns(self, mock_warning):
        """Passing no columns logs one warning and returns an empty dataframe."""
        result = generate_dataset_by_class(self.rng, None, self.dataset_length, None)
        mock_warning.assert_called_once_with(
            "columns_to_generate is empty, empty dataframe will be returned."
        )
        self.assertEqual(pd.DataFrame().empty, result.empty)

    def test_generate_custom_dataset(self):
        """Each generator honours its options and the requested column name."""
        earliest = pd.Timestamp(2001, 12, 22)
        latest = pd.Timestamp(2022, 12, 22)
        binary_chars = ["0", "1"]
        column_specs = [
            {"generator": "integer", "name": "int", "min_value": 4, "max_value": 88},
            {
                "generator": "datetime",
                "name": "dat",
                "start_date": pd.Timestamp(2001, 12, 22),
                "end_date": pd.Timestamp(2022, 12, 22),
            },
            {
                "generator": "text",
                "name": "txt",
                "chars": list(binary_chars),
                "str_len_min": 300,
                "str_len_max": 301,
            },
            {
                "generator": "string",
                "name": "str",
                "chars": list(binary_chars),
                "str_len_min": 2,
                "str_len_max": 5,
            },
            {
                "generator": "categorical",
                "name": "cat",
                "categories": ["X", "Y", "Z"],
                "probabilities": [0.1, 0.5, 0.4],
            },
            {
                "generator": "float",
                "name": "flo",
                "min_value": 3,
                "max_value": 10,
                "sig_figs": 3,
            },
        ]
        df = generate_dataset_by_class(
            self.rng,
            columns_to_generate=column_specs,
            dataset_length=self.dataset_length,
            path=None,
        )

        # test column names
        self.assertListEqual(
            list(df.columns), ["int", "dat", "txt", "str", "cat", "flo"]
        )

        # test ints
        self.assertGreaterEqual(df["int"].min(), 4)
        self.assertLessEqual(df["int"].max(), 88)

        # test floats
        self.assertGreaterEqual(df["flo"].min(), 3)
        self.assertLessEqual(df["flo"].max(), 10)

        # test dates
        for date_str in df["dat"]:
            parsed = pd.to_datetime(date_str, format="%B %d %Y %H:%M:%S")
            self.assertTrue(earliest <= parsed <= latest)

        # test categorical
        self.assertTrue(set(df["cat"]).issubset(["X", "Y", "Z"]))

        # test string and text: only the requested characters may appear
        allowed_chars = set(binary_chars)
        for column in ("str", "txt"):
            for value in df[column]:
                for char in value:
                    self.assertIn(char, allowed_chars)

    @mock.patch("synthetic_data.dataset_generator.pd.DataFrame.to_csv")
    def test_path_to_csv(self, to_csv):
        """
        Ensure csv creation is triggered at the appropriate time.

        :param to_csv: mock of Pandas to_csv()
        :type to_csv: func
        """
        to_csv.return_value = "assume Pandas to_csv for a dataframe runs correctly"
        output_path = "testing_path"
        column_specs = [
            {"generator": key} for key in ("integer", "datetime", "text")
        ]
        generate_dataset_by_class(
            self.rng,
            columns_to_generate=column_specs,
            dataset_length=4,
            path=output_path,
        )
        to_csv.assert_called_once_with(output_path, index=False, encoding="utf-8")