From 05835a5200a3d4bfe6577d00338c5a27ec32312c Mon Sep 17 00:00:00 2001
From: rakow
Date: Mon, 15 Jul 2024 12:56:57 +0200
Subject: [PATCH] clip y_pred before log_loss

---
 matsim/calibration/run_simulations.py | 16 ++---
 matsim/calibration/utils.py           | 88 +--------------------------
 2 files changed, 9 insertions(+), 95 deletions(-)

diff --git a/matsim/calibration/run_simulations.py b/matsim/calibration/run_simulations.py
index a698a07..2d51a94 100644
--- a/matsim/calibration/run_simulations.py
+++ b/matsim/calibration/run_simulations.py
@@ -42,8 +42,7 @@ def sample_y_null(shares: np.array, num_persons: int, num_samples: int):
 
 def process_results(runs):
     """Process results of multiple simulations"""
-    from .utils import log_loss
-    from sklearn.metrics import accuracy_score
+    from sklearn.metrics import log_loss, accuracy_score
     from sklearn.preprocessing import LabelEncoder
 
     print("Processing results in %s" % runs)
@@ -99,14 +98,15 @@ def process_results(runs):
 
     # Compute likelihood with eps as 0.01%
     eps = 0.0001
+    y_pred = np.clip(y_pred, eps, 1 - eps)
 
     result = [
-        ("Log likelihood", -log_loss(y_true, y_pred, sample_weight=dfs.weight, eps=eps, normalize=False),
-         -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, eps=eps, normalize=False)),
-        ("Log likelihood (normalized)", -log_loss(y_true, y_pred, sample_weight=dfs.weight, eps=eps, normalize=True),
-         -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, eps=eps, normalize=True)),
-        ("Log likelihood (null)", -log_loss(y_true, y_null, sample_weight=dfs.weight, eps=eps, normalize=False),
-         -log_loss(y_true, y_null, sample_weight=dfs.weight * dists, eps=eps, normalize=False)),
+        ("Log likelihood", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=False),
+         -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=False)),
+        ("Log likelihood (normalized)", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=True),
+         -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=True)),
+        ("Log likelihood (null)", -log_loss(y_true, y_null, sample_weight=dfs.weight, normalize=False),
+         -log_loss(y_true, y_null, sample_weight=dfs.weight * dists, normalize=False)),
         ("Mean Accuracy", np.mean(accs), np.mean(accs_d)),
         ("Samples", len(dfs), sum(dists)),
         ("Runs", len(pred_cols), len(pred_cols))
diff --git a/matsim/calibration/utils.py b/matsim/calibration/utils.py
index 2d7cbab..ecb353e 100644
--- a/matsim/calibration/utils.py
+++ b/matsim/calibration/utils.py
@@ -5,13 +5,6 @@
 from scipy.special import xlogy
 
 from sklearn.preprocessing import LabelBinarizer, LabelEncoder
-from sklearn.utils import (
-    assert_all_finite,
-    check_array,
-    check_consistent_length,
-    column_or_1d,
-)
-from sklearn.metrics._classification import _weighted_sum
 
 from optuna.trial import TrialState
 
@@ -97,83 +90,4 @@ def _f(jvm_args, jar, config, params_path, run_dir, trial_number, run_args):
             jvm_args, jar, config, run_dir, trial_number, yaml_arg, params_path, run_args
         )
 
-    return _f
-
-
-def log_loss(y_true, y_pred, *, eps="auto", normalize=True, sample_weight=None, labels=None):
-    """Log loss, aka logistic loss or cross-entropy loss.
-    Taken from scikit-learn 1.3."""
-    y_pred = check_array(
-        y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16]
-    )
-    if eps == "auto":
-        eps = np.finfo(y_pred.dtype).eps
-
-    check_consistent_length(y_pred, y_true, sample_weight)
-    lb = LabelBinarizer()
-
-    if labels is not None:
-        lb.fit(labels)
-    else:
-        lb.fit(y_true)
-
-    if len(lb.classes_) == 1:
-        if labels is None:
-            raise ValueError(
-                "y_true contains only one label ({0}). Please "
-                "provide the true labels explicitly through the "
-                "labels argument.".format(lb.classes_[0])
-            )
-        else:
-            raise ValueError(
-                "The labels array needs to contain at least two "
-                "labels for log_loss, "
-                "got {0}.".format(lb.classes_)
-            )
-
-    transformed_labels = lb.transform(y_true)
-
-    if transformed_labels.shape[1] == 1:
-        transformed_labels = np.append(
-            1 - transformed_labels, transformed_labels, axis=1
-        )
-
-    # Clipping
-    y_pred = np.clip(y_pred, eps, 1 - eps)
-
-    # If y_pred is of single dimension, assume y_true to be binary
-    # and then check.
-    if y_pred.ndim == 1:
-        y_pred = y_pred[:, np.newaxis]
-    if y_pred.shape[1] == 1:
-        y_pred = np.append(1 - y_pred, y_pred, axis=1)
-
-    # Check if dimensions are consistent.
-    transformed_labels = check_array(transformed_labels)
-    if len(lb.classes_) != y_pred.shape[1]:
-        if labels is None:
-            raise ValueError(
-                "y_true and y_pred contain different number of "
-                "classes {0}, {1}. Please provide the true "
-                "labels explicitly through the labels argument. "
-                "Classes found in "
-                "y_true: {2}".format(
-                    transformed_labels.shape[1], y_pred.shape[1], lb.classes_
-                )
-            )
-        else:
-            raise ValueError(
-                "The number of classes in labels is different "
-                "from that in y_pred. Classes found in "
-                "labels: {0}".format(lb.classes_)
-            )
-
-    # Renormalize
-    y_pred_sum = y_pred.sum(axis=1)
-    if not np.isclose(y_pred_sum, 1, rtol=1e-15, atol=5 * eps).all():
-        raise ValueError(
-            "y_pred contains values not summing to 1."
-        )
-    y_pred = y_pred / y_pred_sum[:, np.newaxis]
-    loss = -xlogy(transformed_labels, y_pred).sum(axis=1)
-
-    return _weighted_sum(loss, sample_weight, normalize)
\ No newline at end of file
+    return _f
\ No newline at end of file
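
Background on the clip: scikit-learn deprecated the eps parameter of
sklearn.metrics.log_loss in 1.3 and removed it in 1.5, which is why the
vendored 1.3 copy of log_loss in utils.py can be dropped once y_pred is
clipped by hand before scoring. A minimal sketch of the pattern this patch
switches to; the labels and probabilities below are invented for
illustration and do not come from the repository:

    import numpy as np
    from sklearn.metrics import log_loss

    # Same 0.01% epsilon that process_results() uses.
    eps = 0.0001

    # Hypothetical two-class example; log_loss infers classes from y_true
    # in sorted order, so the probability columns are ("bike", "car").
    y_true = np.array(["car", "bike", "car"])
    y_pred = np.array([
        [0.0, 1.0],  # an exact 0 or 1 would make the log loss infinite
        [1.0, 0.0],
        [0.3, 0.7],
    ])

    # Clip away exact 0/1 before scoring, as the patch now does; with two
    # columns each row still sums to exactly 1 after clipping.
    y_pred = np.clip(y_pred, eps, 1 - eps)

    log_likelihood = -log_loss(y_true, y_pred, normalize=False)

With more than two classes, clipping can leave row sums slightly off 1,
which newer scikit-learn releases may warn about or reject, so renormalizing
y_pred after the clip may be needed in the multi-class case.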