From ab610602304bd369699b0ef77206afdd64276485 Mon Sep 17 00:00:00 2001 From: Adrien Couratier Date: Fri, 6 Sep 2024 17:13:46 +0200 Subject: [PATCH] Detail some dostrings and fix typo in the tuto --- examples/tutorials/plot_tuto_mcar.py | 2 +- qolmat/analysis/holes_characterization.py | 88 +++++++++++++++++------ 2 files changed, 68 insertions(+), 22 deletions(-) diff --git a/examples/tutorials/plot_tuto_mcar.py b/examples/tutorials/plot_tuto_mcar.py index 1dc6e3b..cbd2d2f 100644 --- a/examples/tutorials/plot_tuto_mcar.py +++ b/examples/tutorials/plot_tuto_mcar.py @@ -53,7 +53,7 @@ # We choose to use the classic threshold of 5%. If the test p-value is below this threshold, # we reject the null hypothesis. # This notebook shows how the Little and PKLM tests perform on a simplistic case and their -# limitations. We instanciate a test object with a random state for reproducibility. +# limitations. We instantiate a test object with a random state for reproducibility. little_test_mcar = LittleTest(random_state=rng) pklm_test_mcar = PKLMTest(random_state=rng) diff --git a/qolmat/analysis/holes_characterization.py b/qolmat/analysis/holes_characterization.py index d7a39ce..7e585ee 100644 --- a/qolmat/analysis/holes_characterization.py +++ b/qolmat/analysis/holes_characterization.py @@ -2,13 +2,13 @@ from itertools import combinations from typing import List, Optional, Tuple, Union -from category_encoders.one_hot import OneHotEncoder -from joblib import Parallel, delayed import numpy as np import pandas as pd -from sklearn.ensemble import RandomForestClassifier -from sklearn import utils as sku +from category_encoders.one_hot import OneHotEncoder +from joblib import Parallel, delayed from scipy.stats import chi2 +from sklearn import utils as sku +from sklearn.ensemble import RandomForestClassifier from qolmat.imputations.imputers import ImputerEM from qolmat.utils.input_check import check_pd_df_dtypes @@ -16,20 +16,46 @@ class McarTest(ABC): """ - Astract class for MCAR tests. + Abstract base class for performing MCAR (Missing Completely At Random) tests. Parameters ---------- - random_state : int, optional - The seed of the pseudo random number generator to use, for reproductibility. + random_state : int or np.random.RandomState, optional + Seed or random state for reproducibility. + + Methods + ------- + test(df) + Abstract method to perform the MCAR test on the given DataFrame or NumPy array. """ def __init__(self, random_state: Union[None, int, np.random.RandomState] = None): + """ + Initializes the McarTest class with a random state. + + Parameters + ---------- + random_state : int or np.random.RandomState, optional + Seed or random state for reproducibility. + """ self.rng = sku.check_random_state(random_state) @abstractmethod - def test(self, df: pd.DataFrame) -> float: - pass + def test(self, df: Union[pd.DataFrame, np.ndarray]) -> Union[float, Tuple[float, List[float]]]: + """ + Perform the MCAR test on the input data. + + Parameters + ---------- + df : pd.DataFrame or np.ndarray + Data to be tested for MCAR. Can be provided as a pandas DataFrame or a NumPy array. + + Returns + ------- + float or tuple of float and list of float + Test statistic, or a tuple with the test statistic and additional details if applicable. + """ + raise NotImplemented class LittleTest(McarTest): @@ -67,7 +93,7 @@ def __init__( def test(self, df: pd.DataFrame) -> float: """ - Apply the Little's test over a real dataframe. + Apply the Little's test to a real dataframe. Parameters @@ -99,7 +125,10 @@ def test(self, df: pd.DataFrame) -> float: obs_mean = df_pattern.mean().to_numpy() diff_means = obs_mean - ml_means[list(tup_pattern)] - inv_sigma_pattern = np.linalg.inv(ml_cov[:, tup_pattern][tup_pattern, :]) + inv_sigma_pattern = np.linalg.solve( + ml_cov[:, tup_pattern][tup_pattern, :], + np.eye(len(tup_pattern)) + ) d0 += n_rows_pattern * np.dot( np.dot(diff_means, inv_sigma_pattern), diff_means.T @@ -130,6 +159,9 @@ class PKLMTest(McarTest): ----------- nb_projections : int Number of projections. + nb_projections_threshold : int + If the maximum number of possible permutations is less than this threshold, then all + projections are used. Otherwise, nb_projections random projections are drawn. nb_permutation : int Number of permutations. nb_trees_per_proj : int @@ -167,12 +199,13 @@ def __init__( if self.exact_p_value: self.process_permutation = self._parallel_process_permutation_exact - self.process_permutation = self._parallel_process_permutation + else: + self.process_permutation = self._parallel_process_permutation def _encode_dataframe(self, df: pd.DataFrame) -> np.ndarray: """ Encodes the DataFrame by converting numeric columns to a numpy array - and applying one-hot encoding to non-numeric columns. + and applying one-hot encoding to objects and boolean columns. Parameters: ----------- @@ -250,6 +283,7 @@ def _get_max_draw(p: int) -> int: def _draw_features_and_target_indexes(self, X: np.ndarray) -> Tuple[List[int], int]: """ Randomly selects features and a target from the dataframe. + This corresponds to the Ai and Bi projections of the paper. Parameters: ----------- @@ -263,15 +297,15 @@ def _draw_features_and_target_indexes(self, X: np.ndarray) -> Tuple[List[int], i """ _, p = X.shape nb_features = self.rng.randint(1, p) - features_idx = self.rng.choice(range(p), size=nb_features, replace=False) + features_idx = self.rng.choice(p, size=nb_features, replace=False) target_idx = self.rng.choice(np.setdiff1d(np.arange(p), features_idx)) return features_idx.tolist(), target_idx @staticmethod def _check_draw(X: np.ndarray, features_idx: List[int], target_idx: int) -> np.bool_: """ - Checks if the drawn features and target are valid. - # TODO : Need to develop ? + Checks if the drawn features and target are valid. Here we check that the number of induced + classes is equal to 2. Using the notation from the paper, we want |G(Ai, Bi)| = 2. Parameters: ----------- @@ -294,7 +328,8 @@ def _check_draw(X: np.ndarray, features_idx: List[int], target_idx: int) -> np.b def _generate_label_feature_combinations(self, X: np.ndarray) -> List[Tuple[int, List[int]]]: """ - Generates all valid combinations of features and labels for projection. + Generates all valid combinations of features and labels for projection if + nb_projections_threshold > _get_max_draw(X.shape[1]). Parameters: ----------- @@ -324,6 +359,7 @@ def _generate_label_feature_combinations(self, X: np.ndarray) -> List[Tuple[int, def _draw_projection(self, X: np.ndarray) -> Tuple[List[int], int]: """ Draws a valid projection of features and a target. + If nb_projections_threshold < _get_max_draw(X.shape[1]). Parameters: ----------- @@ -348,8 +384,10 @@ def _build_dataset( target_idx: int ) -> Tuple[np.ndarray, np.ndarray]: """ - Builds a dataset by selecting specified features and target from a NumPy array, - excluding rows with NaN values in the feature columns. + Builds a dataset by selecting specified features and target from a NumPy array, excluding + rows with NaN values in the feature columns. + For the label, we create a binary classification problem where yi =1 if target_idx_i is + missing. Parameters: ----------- @@ -363,7 +401,7 @@ def _build_dataset( Returns: -------- Tuple[np.ndarray, np.ndarray]: A tuple containing: - - X (np.ndarray): Array of selected features. + - X (np.ndarray): Full observed array of selected features. - y (np.ndarray): Binary array indicating presence of NaN (1) in the target column. """ X_features = X[~np.isnan(X[:, features_idx]).any(axis=1)][:, features_idx] @@ -384,6 +422,8 @@ def _build_label( """ Builds a label array by selecting target values from a permutation array, excluding rows with NaN values in the specified feature columns. + For the label, we create a binary classification problem where yi =1 if target_idx_i is + missing. Parameters: ----------- @@ -419,11 +459,11 @@ def _get_oob_probabilities(self, X: np.ndarray, y: np.ndarray) -> np.ndarray: """ clf = RandomForestClassifier( n_estimators=self.nb_trees_per_proj, - # max_features=None, min_samples_split=10, bootstrap=True, oob_score=True, random_state=self.rng, + max_features=1., ) clf.fit(X, y) return clf.oob_decision_function_ @@ -445,6 +485,9 @@ def _U_hat(oob_probabilities: np.ndarray, labels: np.ndarray) -> float: -------- float: The computed U_hat statistic. """ + if oob_probabilities.shape[1] == 1: + return 0. + oob_probabilities = np.clip(oob_probabilities, 1e-9, 1 - 1e-9) unique_labels = np.unique(labels) @@ -502,6 +545,7 @@ def _parallel_process_permutation_exact( ) -> float: X_features, _ = self._build_dataset(X, features_idx, target_idx) y = self._build_label(X, M_perm, features_idx, target_idx) + # In this case, we fit the classifier in each permutation. It takes much more longer. oob_probabilities = self._get_oob_probabilities(X_features, y) return self._U_hat(oob_probabilities, y) @@ -515,6 +559,8 @@ def _parallel_process_projection( X_features, y = self._build_dataset(X, features_idx, target_idx) oob_probabilities = self._get_oob_probabilities(X_features, y) u_hat = self._U_hat(oob_probabilities, y) + # We iterate over the permutation because for a given projection, we fit only one classifier + # to get oob probabilities and compute u_hat nb_permutations times. result_u_permutations = Parallel(n_jobs=-1)( delayed(self.process_permutation)( X, M_perm, features_idx, target_idx, oob_probabilities