diff --git a/dask_ml/datasets.py b/dask_ml/datasets.py
index a561ee0d5..e9375295b 100644
--- a/dask_ml/datasets.py
+++ b/dask_ml/datasets.py
@@ -381,10 +381,18 @@ def make_classification(
     return X, y
 
 
-def random_date(start, end):
+def random_date(start, end, random_state=None):
+    """Return ``start`` shifted by a random whole number of seconds.
+
+    The offset is drawn from ``[0, end - start)``, measured in seconds.
+    ``random_state`` is resolved with ``sklearn.utils.check_random_state``,
+    so it may be ``None``, an int seed, or a ``RandomState`` instance.
+    Passing one shared ``RandomState`` instance to successive calls gives a
+    reproducible sequence of draws instead of the same value every time.
+    """
+    rng_random_date = sklearn.utils.check_random_state(random_state)
     delta = end - start
     int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
-    random_second = np.random.randint(int_delta)
+    random_second = rng_random_date.randint(int_delta)
     return start + timedelta(seconds=random_second)
 
 
@@ -430,6 +438,11 @@ def make_classification_df(
         The output values.
 
     """
+    # One generator is shared by every date draw below, so rows get different
+    # dates while a given ``random_state`` still reproduces the same column.
+    # ``None`` falls back to NumPy's global generator, not a fixed seed.
+    rng_dates = sklearn.utils.check_random_state(random_state)
+
     X_array, y_array = make_classification(
         n_samples=n_samples,
         flip_y=(1 - predictability),
@@ -451,8 +464,10 @@ def make_classification_df(
             [
                 X_df,
                 dd.from_array(
-                    np.array([random_date(*dates)] * len(X_df)),
-                    chunksize=chunks,
+                    np.array(
+                        [random_date(*dates, rng_dates) for _ in range(len(X_df))]
+                    ),
+                    chunksize=n_samples,
                     columns=["date"],
                 ),
             ],
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index d221e2963..8a0b5c959 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -73,6 +73,16 @@ def test_make_classification_df():
         dates=(date(2014, 1, 1), date(2015, 1, 1)),
     )
 
+    # A second, independently generated frame; its dates are compared with
+    # those of X_df below.
+    X_df1, y_series1 = dask_ml.datasets.make_classification_df(
+        n_samples=100,
+        n_features=5,
+        random_state=123,
+        chunks=100,
+        dates=(date(2014, 1, 1), date(2015, 1, 1)),
+    )
+    check_randomness = np.unique((X_df["date"] == X_df1["date"]).compute())
+
     assert X_df is not None
     assert y_series is not None
     assert "date" in X_df.columns
@@ -80,3 +91,8 @@ def test_make_classification_df():
     assert len(X_df) == 100
     assert len(y_series) == 100
     assert isinstance(y_series, dask.dataframe.core.Series)
+    assert check_randomness.size == 1
+    # np.unique yields NumPy booleans, which are never identical to the
+    # ``True`` singleton, so test truthiness rather than identity.
+    assert check_randomness[0]
+    assert np.unique(X_df["date"]).size >= 2