From 05835a5200a3d4bfe6577d00338c5a27ec32312c Mon Sep 17 00:00:00 2001
From: rakow
Date: Mon, 15 Jul 2024 12:56:57 +0200
Subject: [PATCH] clip y_pred before log_loss

---
 matsim/calibration/run_simulations.py | 16 ++---
 matsim/calibration/utils.py           | 88 +--------------------------
 2 files changed, 9 insertions(+), 95 deletions(-)

diff --git a/matsim/calibration/run_simulations.py b/matsim/calibration/run_simulations.py
index a698a07..2d51a94 100644
--- a/matsim/calibration/run_simulations.py
+++ b/matsim/calibration/run_simulations.py
@@ -42,8 +42,7 @@ def sample_y_null(shares: np.array, num_persons: int, num_samples: int):
 
 def process_results(runs):
     """Process results of multiple simulations"""
-    from .utils import log_loss
-    from sklearn.metrics import accuracy_score
+    from sklearn.metrics import log_loss, accuracy_score
     from sklearn.preprocessing import LabelEncoder
 
     print("Processing results in %s" % runs)
@@ -99,14 +98,15 @@ def process_results(runs):
 
     # Compute likelihood with eps as 0.01%
     eps = 0.0001
+    y_pred = np.clip(y_pred, eps, 1 - eps)
 
     result = [
-        ("Log likelihood", -log_loss(y_true, y_pred, sample_weight=dfs.weight, eps=eps, normalize=False),
-         -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, eps=eps, normalize=False)),
-        ("Log likelihood (normalized)", -log_loss(y_true, y_pred, sample_weight=dfs.weight, eps=eps, normalize=True),
-         -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, eps=eps, normalize=True)),
-        ("Log likelihood (null)", -log_loss(y_true, y_null, sample_weight=dfs.weight, eps=eps, normalize=False),
-         -log_loss(y_true, y_null, sample_weight=dfs.weight * dists, eps=eps, normalize=False)),
+        ("Log likelihood", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=False),
+         -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=False)),
+        ("Log likelihood (normalized)", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=True),
+         -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=True)),
+        ("Log likelihood (null)", -log_loss(y_true, y_null, sample_weight=dfs.weight, normalize=False),
+         -log_loss(y_true, y_null, sample_weight=dfs.weight * dists, normalize=False)),
         ("Mean Accuracy", np.mean(accs), np.mean(accs_d)),
         ("Samples", len(dfs), sum(dists)),
         ("Runs", len(pred_cols), len(pred_cols))
diff --git a/matsim/calibration/utils.py b/matsim/calibration/utils.py
index 2d7cbab..ecb353e 100644
--- a/matsim/calibration/utils.py
+++ b/matsim/calibration/utils.py
@@ -5,13 +5,6 @@
 from scipy.special import xlogy
 
 from sklearn.preprocessing import LabelBinarizer, LabelEncoder
-from sklearn.utils import (
-    assert_all_finite,
-    check_array,
-    check_consistent_length,
-    column_or_1d,
-)
-from sklearn.metrics._classification import _weighted_sum
 
 from optuna.trial import TrialState
 
@@ -97,83 +90,4 @@ def _f(jvm_args, jar, config, params_path, run_dir, trial_number, run_args):
             jvm_args, jar, config, run_dir, trial_number, yaml_arg, params_path, run_args
         )
 
-    return _f
-
-
-def log_loss(y_true, y_pred, *, eps="auto", normalize=True, sample_weight=None, labels=None):
-    """Log loss, aka logistic loss or cross-entropy loss.
-    Taken from scikit-learn 1.3."""
-    y_pred = check_array(
-        y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16]
-    )
-    if eps == "auto":
-        eps = np.finfo(y_pred.dtype).eps
-
-    check_consistent_length(y_pred, y_true, sample_weight)
-    lb = LabelBinarizer()
-
-    if labels is not None:
-        lb.fit(labels)
-    else:
-        lb.fit(y_true)
-
-    if len(lb.classes_) == 1:
-        if labels is None:
-            raise ValueError(
-                "y_true contains only one label ({0}). Please "
-                "provide the true labels explicitly through the "
-                "labels argument.".format(lb.classes_[0])
-            )
-        else:
-            raise ValueError(
-                "The labels array needs to contain at least two "
-                "labels for log_loss, "
-                "got {0}.".format(lb.classes_)
-            )
-
-    transformed_labels = lb.transform(y_true)
-
-    if transformed_labels.shape[1] == 1:
-        transformed_labels = np.append(
-            1 - transformed_labels, transformed_labels, axis=1
-        )
-
-    # Clipping
-    y_pred = np.clip(y_pred, eps, 1 - eps)
-
-    # If y_pred is of single dimension, assume y_true to be binary
-    # and then check.
-    if y_pred.ndim == 1:
-        y_pred = y_pred[:, np.newaxis]
-    if y_pred.shape[1] == 1:
-        y_pred = np.append(1 - y_pred, y_pred, axis=1)
-
-    # Check if dimensions are consistent.
-    transformed_labels = check_array(transformed_labels)
-    if len(lb.classes_) != y_pred.shape[1]:
-        if labels is None:
-            raise ValueError(
-                "y_true and y_pred contain different number of "
-                "classes {0}, {1}. Please provide the true "
-                "labels explicitly through the labels argument. "
-                "Classes found in "
-                "y_true: {2}".format(
-                    transformed_labels.shape[1], y_pred.shape[1], lb.classes_
-                )
-            )
-        else:
-            raise ValueError(
-                "The number of classes in labels is different "
-                "from that in y_pred. Classes found in "
-                "labels: {0}".format(lb.classes_)
-            )
-
-    # Renormalize
-    y_pred_sum = y_pred.sum(axis=1)
-    if not np.isclose(y_pred_sum, 1, rtol=1e-15, atol=5 * eps).all():
-        raise ValueError(
-            "y_pred contains values not summing to 1."
-        )
-    y_pred = y_pred / y_pred_sum[:, np.newaxis]
-    loss = -xlogy(transformed_labels, y_pred).sum(axis=1)
-
-    return _weighted_sum(loss, sample_weight, normalize)
\ No newline at end of file
+    return _f
\ No newline at end of file
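
Background on the clip: scikit-learn deprecated the eps parameter of
sklearn.metrics.log_loss in 1.3 and removed it in 1.5, which is why the
vendored 1.3 copy of log_loss in utils.py can be dropped once y_pred is
clipped by hand before scoring. A minimal sketch of the pattern this patch
switches to; the labels and probabilities below are invented for
illustration and do not come from the repository:

    import numpy as np
    from sklearn.metrics import log_loss

    # Same 0.01% epsilon that process_results() uses.
    eps = 0.0001

    # Hypothetical two-class example; log_loss infers classes from y_true
    # in sorted order, so the probability columns are ("bike", "car").
    y_true = np.array(["car", "bike", "car"])
    y_pred = np.array([
        [0.0, 1.0],  # an exact 0 or 1 would make the log loss infinite
        [1.0, 0.0],
        [0.3, 0.7],
    ])

    # Clip away exact 0/1 before scoring, as the patch now does; with two
    # columns each row still sums to exactly 1 after clipping.
    y_pred = np.clip(y_pred, eps, 1 - eps)

    log_likelihood = -log_loss(y_true, y_pred, normalize=False)

With more than two classes, clipping can leave row sums slightly off 1,
which newer scikit-learn releases may warn about or reject, so renormalizing
y_pred after the clip may be needed in the multi-class case.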