dask · BhuvanashreeM · Aug 29, 2021 · Aug 31, 2021 · Sep 3, 2021 · Sep 3, 2021
diff --git a/dask_ml/datasets.py b/dask_ml/datasets.py
@@ -381,10 +381,11 @@ def make_classification(
  return X, y
 
 
-def random_date(start, end):
+def random_date(start, end, random_state=None):
+ rng_random_date = sklearn.utils.check_random_state(random_state)  # None/int/RandomState -> RandomState
  delta = end - start
  int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
- random_second = np.random.randint(int_delta)
+ random_second = int(rng_random_date.randint(int_delta))  # timedelta needs a plain int
  return start + timedelta(seconds=random_second)
 
 
@@ -430,6 +431,13 @@ def make_classification_df(
  The output values.
 
  """
+ if (
+ random_state is not None
+ or not isinstance(random_state, np.random.RandomState)
+ or not isinstance(random_state, int)
+ ):
+ random_state = None
+
  X_array, y_array = make_classification(
  n_samples=n_samples,
  flip_y=(1 - predictability),
@@ -451,8 +459,13 @@ def make_classification_df(
  [
  X_df,
  dd.from_array(
- np.array([random_date(*dates)] * len(X_df)),
- chunksize=chunks,
+ np.array(
+ [
+ random_date(*dates, random_state + i)
+ for i in range(len(X_df))
+ ]
+ ),
+ chunksize=n_samples,
  columns=["date"],
  ),
  ],

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
@@ -73,10 +73,22 @@ def test_make_classification_df():
  dates=(date(2014, 1, 1), date(2015, 1, 1)),
  )
 
+ X_df1, y_series1 = dask_ml.datasets.make_classification_df(
+ n_samples=100,
+ n_features=5,
+ random_state=123,
+ chunks=100,
+ dates=(date(2014, 1, 1), date(2015, 1, 1)),
+ )
+ check_randomness = np.unique((X_df["date"] == X_df1["date"]).compute())
+
  assert X_df is not None
  assert y_series is not None
  assert "date" in X_df.columns
  assert len(X_df.columns) == 6
  assert len(X_df) == 100
  assert len(y_series) == 100
  assert isinstance(y_series, dask.dataframe.core.Series)
+ assert check_randomness.size == 1
+ assert check_randomness[0] is True
+ assert np.unique(X_df["date"]).size >= 2