Skip to content

Commit

Permalink
Add option to aggregate outputs of c2st
Browse files Browse the repository at this point in the history
  • Loading branch information
stefanradev93 committed Aug 13, 2023
1 parent bb0cf65 commit ae7f8c4
Showing 1 changed file with 30 additions and 16 deletions.
46 changes: 30 additions & 16 deletions bayesflow/computational_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
import tensorflow as tf
from scipy import stats
from sklearn.calibration import calibration_curve
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, KFold

from bayesflow.default_settings import MMD_BANDWIDTH_LIST
from bayesflow.exceptions import ShapeError
Expand Down Expand Up @@ -521,30 +521,41 @@ def aggregated_rmse(x_true, x_pred):
)


def c2st(source_samples, target_samples, n_folds=5, scoring="accuracy", normalize=True, seed=123,
hidden_units_per_dim=10):
"""C2ST metric [1] using an sklearn MLP classifier.
def c2st(
source_samples,
target_samples,
n_folds=5,
scoring="accuracy",
normalize=True,
seed=123,
hidden_units_per_dim=16,
aggregate_output=True,
):
"""C2ST metric [1] using an sklearn neural network classifier (i.e., MLP).
Code adapted from https://github.com/sbi-benchmark/sbibm/blob/main/sbibm/metrics/c2st.py
[1] Lopez-Paz, D., & Oquab, M. (2016). Revisiting classifier two-sample tests. arXiv:1610.06545.
Parameters
----------
source_samples : np.ndarray or tf.Tensor
source_samples : np.ndarray or tf.Tensor
Source samples (e.g., approximate posterior samples)
target_samples : np.ndarray or tf.Tensor
target_samples : np.ndarray or tf.Tensor
Target samples (e.g., samples from a reference posterior)
n_folds : int, optional, default: 5
n_folds : int, optional, default: 5
Number of folds in k-fold cross-validation for the classifier evaluation
scoring : str, optional, default: "accuracy"
scoring : str, optional, default: "accuracy"
Evaluation score of the sklearn MLP classifier
normalize : bool, optional, default: True
normalize : bool, optional, default: True
Whether the data shall be z-standardized relative to source_samples
seed : int, optional, default: 123
seed : int, optional, default: 123
RNG seed for the MLP and k-fold CV
hidden_units_per_dim : int, optional, default: 10
hidden_units_per_dim : int, optional, default: 16
Number of hidden units in the MLP, relative to the input dimensions.
Example: source samples are 5D, hidden_units_per_dim=10 -> 50 hidden units per layer
Example: source samples are 5D, hidden_units_per_dim=16 -> 80 hidden units per layer
aggregate_output : bool, optional, default: True
Whether to return a single value aggregated over all cross-validation runs
or all values from all runs. If left at default, the empirical mean will be returned
Returns
-------
Expand All @@ -558,9 +569,11 @@ def c2st(source_samples, target_samples, n_folds=5, scoring="accuracy", normaliz

num_dims = x.shape[1]
if not num_dims == y.shape[1]:
raise ShapeError(f"source_samples and target_samples can have different number of observations (1st dim)"
f"but must have the same dimensionality (2nd dim)"
f"found: source_samples {source_samples.shape[1]}, target_samples {target_samples.shape[1]}")
raise ShapeError(
f"source_samples and target_samples can have different number of observations (1st dim)"
f"but must have the same dimensionality (2nd dim)"
f"found: source_samples {source_samples.shape[1]}, target_samples {target_samples.shape[1]}"
)

if normalize:
x_mean = np.mean(x, axis=0)
Expand All @@ -587,5 +600,6 @@ def c2st(source_samples, target_samples, n_folds=5, scoring="accuracy", normaliz
shuffle = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
scores = cross_val_score(clf, data, target, cv=shuffle, scoring=scoring)

c2st_score = np.asarray(np.mean(scores)).astype(np.float32)
if aggregate_output:
c2st_score = np.asarray(np.mean(scores)).astype(np.float32)
return c2st_score

0 comments on commit ae7f8c4

Please sign in to comment.