From 4b55f9a8defb3a083e9fd3bf4a0a297734590ad7 Mon Sep 17 00:00:00 2001 From: Bhuvanashree M Date: Sun, 29 Aug 2021 18:33:28 +0530 Subject: [PATCH 1/5] fix populates make_classification_df with random dates --- dask_ml/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_ml/datasets.py b/dask_ml/datasets.py index a561ee0d5..ddce38651 100644 --- a/dask_ml/datasets.py +++ b/dask_ml/datasets.py @@ -451,8 +451,8 @@ def make_classification_df( [ X_df, dd.from_array( - np.array([random_date(*dates)] * len(X_df)), - chunksize=chunks, + np.array([random_date(*dates) for i in range(len(X_df))]), + chunksize=n_samples, columns=["date"], ), ], From 424fca4924e3aaab0fa070c10eb9800f7b885ae4 Mon Sep 17 00:00:00 2001 From: Bhuvanashree M Date: Tue, 31 Aug 2021 16:38:21 +0530 Subject: [PATCH 2/5] added-seed-to-random_date-and-modified-test_datasets --- dask_ml/datasets.py | 12 +++++++++--- tests/test_datasets.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/dask_ml/datasets.py b/dask_ml/datasets.py index ddce38651..f41713e99 100644 --- a/dask_ml/datasets.py +++ b/dask_ml/datasets.py @@ -381,10 +381,11 @@ def make_classification( return X, y -def random_date(start, end): +def random_date(start, end, random_state=None): + rng_random_date = dask_ml.utils.check_random_state(random_state) delta = end - start int_delta = (delta.days * 24 * 60 * 60) + delta.seconds - random_second = np.random.randint(int_delta) + random_second = rng_random_date.randint(int_delta).compute().item() return start + timedelta(seconds=random_second) @@ -451,7 +452,12 @@ def make_classification_df( [ X_df, dd.from_array( - np.array([random_date(*dates) for i in range(len(X_df))]), + np.array( + [ + random_date(*dates, random_state + i) + for i in range(len(X_df)) + ] + ), chunksize=n_samples, columns=["date"], ), diff --git a/tests/test_datasets.py b/tests/test_datasets.py index d221e2963..775de7ece 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -73,6 +73,15 @@ def test_make_classification_df(): dates=(date(2014, 1, 1), date(2015, 1, 1)), ) + X_df1, y_series1 = dask_ml.datasets.make_classification_df( + n_samples=100, + n_features=5, + random_state=123, + chunks=100, + dates=(date(2014, 1, 1), date(2015, 1, 1)), + ) + check_randomness = np.unique((X_df["date"] == X_df1["date"]).compute()) + assert X_df is not None assert y_series is not None assert "date" in X_df.columns @@ -80,3 +89,5 @@ def test_make_classification_df(): assert len(X_df) == 100 assert len(y_series) == 100 assert isinstance(y_series, dask.dataframe.core.Series) + assert check_randomness.size == 1 + assert check_randomness[0] is True From b740b27e88ad5410843a31d52961e5b74bff8a15 Mon Sep 17 00:00:00 2001 From: Bhuvanashree M Date: Fri, 3 Sep 2021 08:49:42 +0530 Subject: [PATCH 3/5] check-for-unique-values --- tests/test_datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 775de7ece..8a0b5c959 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -91,3 +91,4 @@ def test_make_classification_df(): assert isinstance(y_series, dask.dataframe.core.Series) assert check_randomness.size == 1 assert check_randomness[0] is True + assert np.unique(X_df["date"]).size >= 2 From 24461fbf6e47d01ef69004058175e63bca09db02 Mon Sep 17 00:00:00 2001 From: Bhuvanashree M Date: Fri, 3 Sep 2021 12:41:02 +0530 Subject: [PATCH 4/5] checks-for-random_state-type --- dask_ml/datasets.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dask_ml/datasets.py b/dask_ml/datasets.py index f41713e99..2d6de11a5 100644 --- a/dask_ml/datasets.py +++ b/dask_ml/datasets.py @@ -431,6 +431,13 @@ def make_classification_df( The output values. """ + if ( + random_state is not None + or not isinstance(random_state, np.random.RandomState) + or not isinstance(random_state, int) + ): + random_state = None + X_array, y_array = make_classification( n_samples=n_samples, flip_y=(1 - predictability), From 3a7c9d190f2153b4eb477e6d5c839a6211efd969 Mon Sep 17 00:00:00 2001 From: Bhuvanashree M Date: Sat, 4 Sep 2021 15:05:37 +0530 Subject: [PATCH 5/5] removed-redundant-compute-calls-in-random_date --- dask_ml/datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dask_ml/datasets.py b/dask_ml/datasets.py index 2d6de11a5..e9375295b 100644 --- a/dask_ml/datasets.py +++ b/dask_ml/datasets.py @@ -382,10 +382,10 @@ def make_classification( def random_date(start, end, random_state=None): - rng_random_date = dask_ml.utils.check_random_state(random_state) + rng_random_date = sklearn.utils.check_random_state(random_state) delta = end - start int_delta = (delta.days * 24 * 60 * 60) + delta.seconds - random_second = rng_random_date.randint(int_delta).compute().item() + random_second = rng_random_date.randint(int_delta) return start + timedelta(seconds=random_second) @@ -436,7 +436,7 @@ def make_classification_df( or not isinstance(random_state, np.random.RandomState) or not isinstance(random_state, int) ): - random_state = None + random_state = 42 X_array, y_array = make_classification( n_samples=n_samples,