Skip to content

Commit

Permalink
Detail some docstrings and fix typo in the tuto
Browse files Browse the repository at this point in the history
  • Loading branch information
adriencrtrcap committed Sep 6, 2024
1 parent 5b988ec commit ab61060
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 22 deletions.
2 changes: 1 addition & 1 deletion examples/tutorials/plot_tuto_mcar.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
# We choose to use the classic threshold of 5%. If the test p-value is below this threshold,
# we reject the null hypothesis.
# This notebook shows how the Little and PKLM tests perform on a simplistic case and their
# limitations. We instanciate a test object with a random state for reproducibility.
# limitations. We instantiate a test object with a random state for reproducibility.

little_test_mcar = LittleTest(random_state=rng)
pklm_test_mcar = PKLMTest(random_state=rng)
Expand Down
88 changes: 67 additions & 21 deletions qolmat/analysis/holes_characterization.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,60 @@
from itertools import combinations
from typing import List, Optional, Tuple, Union

from category_encoders.one_hot import OneHotEncoder
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import utils as sku
from category_encoders.one_hot import OneHotEncoder
from joblib import Parallel, delayed
from scipy.stats import chi2
from sklearn import utils as sku
from sklearn.ensemble import RandomForestClassifier

from qolmat.imputations.imputers import ImputerEM
from qolmat.utils.input_check import check_pd_df_dtypes


class McarTest(ABC):
    """
    Abstract base class for performing MCAR (Missing Completely At Random) tests.

    Parameters
    ----------
    random_state : int or np.random.RandomState, optional
        Seed or random state for reproducibility.

    Methods
    -------
    test(df)
        Abstract method to perform the MCAR test on the given DataFrame or NumPy array.
    """

    def __init__(self, random_state: Union[None, int, np.random.RandomState] = None):
        """
        Initialize the McarTest with a validated random number generator.

        Parameters
        ----------
        random_state : int or np.random.RandomState, optional
            Seed or random state for reproducibility.
        """
        # check_random_state normalizes None / int seed / RandomState
        # into a single np.random.RandomState instance.
        self.rng = sku.check_random_state(random_state)

    @abstractmethod
    def test(self, df: Union[pd.DataFrame, np.ndarray]) -> Union[float, Tuple[float, List[float]]]:
        """
        Perform the MCAR test on the input data.

        Parameters
        ----------
        df : pd.DataFrame or np.ndarray
            Data to be tested for MCAR. Can be provided as a pandas DataFrame or a NumPy array.

        Returns
        -------
        float or tuple of float and list of float
            Test statistic, or a tuple with the test statistic and additional details if
            applicable.
        """
        # Bug fix: the diff introduced `raise NotImplemented`, which raises
        # TypeError because NotImplemented is a sentinel value, not an
        # exception class. NotImplementedError is the correct exception for
        # an unimplemented abstract method body.
        raise NotImplementedError


class LittleTest(McarTest):
Expand Down Expand Up @@ -67,7 +93,7 @@ def __init__(

def test(self, df: pd.DataFrame) -> float:
"""
Apply the Little's test over a real dataframe.
Apply the Little's test to a real dataframe.
Parameters
Expand Down Expand Up @@ -99,7 +125,10 @@ def test(self, df: pd.DataFrame) -> float:
obs_mean = df_pattern.mean().to_numpy()

diff_means = obs_mean - ml_means[list(tup_pattern)]
inv_sigma_pattern = np.linalg.inv(ml_cov[:, tup_pattern][tup_pattern, :])
inv_sigma_pattern = np.linalg.solve(
ml_cov[:, tup_pattern][tup_pattern, :],
np.eye(len(tup_pattern))
)

d0 += n_rows_pattern * np.dot(
np.dot(diff_means, inv_sigma_pattern), diff_means.T
Expand Down Expand Up @@ -130,6 +159,9 @@ class PKLMTest(McarTest):
-----------
nb_projections : int
Number of projections.
nb_projections_threshold : int
If the maximum number of possible projections is less than this threshold, then all
projections are used. Otherwise, nb_projections random projections are drawn.
nb_permutation : int
Number of permutations.
nb_trees_per_proj : int
Expand Down Expand Up @@ -167,12 +199,13 @@ def __init__(

if self.exact_p_value:
self.process_permutation = self._parallel_process_permutation_exact
self.process_permutation = self._parallel_process_permutation
else:
self.process_permutation = self._parallel_process_permutation

def _encode_dataframe(self, df: pd.DataFrame) -> np.ndarray:
"""
Encodes the DataFrame by converting numeric columns to a numpy array
and applying one-hot encoding to non-numeric columns.
and applying one-hot encoding to objects and boolean columns.
Parameters:
-----------
Expand Down Expand Up @@ -250,6 +283,7 @@ def _get_max_draw(p: int) -> int:
def _draw_features_and_target_indexes(self, X: np.ndarray) -> Tuple[List[int], int]:
"""
Randomly selects features and a target from the dataframe.
This corresponds to the Ai and Bi projections of the paper.
Parameters:
-----------
Expand All @@ -263,15 +297,15 @@ def _draw_features_and_target_indexes(self, X: np.ndarray) -> Tuple[List[int], i
"""
_, p = X.shape
nb_features = self.rng.randint(1, p)
features_idx = self.rng.choice(range(p), size=nb_features, replace=False)
features_idx = self.rng.choice(p, size=nb_features, replace=False)
target_idx = self.rng.choice(np.setdiff1d(np.arange(p), features_idx))
return features_idx.tolist(), target_idx

@staticmethod
def _check_draw(X: np.ndarray, features_idx: List[int], target_idx: int) -> np.bool_:
"""
Checks if the drawn features and target are valid.
# TODO : Need to develop ?
Checks if the drawn features and target are valid. Here we check that the number of induced
classes is equal to 2. Using the notation from the paper, we want |G(Ai, Bi)| = 2.
Parameters:
-----------
Expand All @@ -294,7 +328,8 @@ def _check_draw(X: np.ndarray, features_idx: List[int], target_idx: int) -> np.b

def _generate_label_feature_combinations(self, X: np.ndarray) -> List[Tuple[int, List[int]]]:
"""
Generates all valid combinations of features and labels for projection.
Generates all valid combinations of features and labels for projection if
nb_projections_threshold > _get_max_draw(X.shape[1]).
Parameters:
-----------
Expand Down Expand Up @@ -324,6 +359,7 @@ def _generate_label_feature_combinations(self, X: np.ndarray) -> List[Tuple[int,
def _draw_projection(self, X: np.ndarray) -> Tuple[List[int], int]:
"""
Draws a valid projection of features and a target.
If nb_projections_threshold < _get_max_draw(X.shape[1]).
Parameters:
-----------
Expand All @@ -348,8 +384,10 @@ def _build_dataset(
target_idx: int
) -> Tuple[np.ndarray, np.ndarray]:
"""
Builds a dataset by selecting specified features and target from a NumPy array,
excluding rows with NaN values in the feature columns.
Builds a dataset by selecting specified features and target from a NumPy array, excluding
rows with NaN values in the feature columns.
For the label, we create a binary classification problem where y_i = 1 if entry i of the
target column is missing.
Parameters:
-----------
Expand All @@ -363,7 +401,7 @@ def _build_dataset(
Returns:
--------
Tuple[np.ndarray, np.ndarray]: A tuple containing:
- X (np.ndarray): Array of selected features.
- X (np.ndarray): Full observed array of selected features.
- y (np.ndarray): Binary array indicating presence of NaN (1) in the target column.
"""
X_features = X[~np.isnan(X[:, features_idx]).any(axis=1)][:, features_idx]
Expand All @@ -384,6 +422,8 @@ def _build_label(
"""
Builds a label array by selecting target values from a permutation array,
excluding rows with NaN values in the specified feature columns.
For the label, we create a binary classification problem where y_i = 1 if entry i of the
target column is missing.
Parameters:
-----------
Expand Down Expand Up @@ -419,11 +459,11 @@ def _get_oob_probabilities(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
"""
clf = RandomForestClassifier(
n_estimators=self.nb_trees_per_proj,
# max_features=None,
min_samples_split=10,
bootstrap=True,
oob_score=True,
random_state=self.rng,
max_features=1.,
)
clf.fit(X, y)
return clf.oob_decision_function_
Expand All @@ -445,6 +485,9 @@ def _U_hat(oob_probabilities: np.ndarray, labels: np.ndarray) -> float:
--------
float: The computed U_hat statistic.
"""
if oob_probabilities.shape[1] == 1:
return 0.

oob_probabilities = np.clip(oob_probabilities, 1e-9, 1 - 1e-9)

unique_labels = np.unique(labels)
Expand Down Expand Up @@ -502,6 +545,7 @@ def _parallel_process_permutation_exact(
) -> float:
X_features, _ = self._build_dataset(X, features_idx, target_idx)
y = self._build_label(X, M_perm, features_idx, target_idx)
# In this case, we fit the classifier for each permutation, which takes much longer.
oob_probabilities = self._get_oob_probabilities(X_features, y)
return self._U_hat(oob_probabilities, y)

Expand All @@ -515,6 +559,8 @@ def _parallel_process_projection(
X_features, y = self._build_dataset(X, features_idx, target_idx)
oob_probabilities = self._get_oob_probabilities(X_features, y)
u_hat = self._U_hat(oob_probabilities, y)
# We iterate over the permutations because, for a given projection, we fit only one
# classifier to get oob probabilities and compute u_hat nb_permutations times.
result_u_permutations = Parallel(n_jobs=-1)(
delayed(self.process_permutation)(
X, M_perm, features_idx, target_idx, oob_probabilities
Expand Down

0 comments on commit ab61060

Please sign in to comment.