Skip to content

Commit

Permalink
Add calculate_cng_indices
Browse files Browse the repository at this point in the history
  • Loading branch information
sjawhar committed Mar 15, 2022
1 parent 94ac334 commit 62823a7
Showing 1 changed file with 67 additions and 1 deletion.
68 changes: 67 additions & 1 deletion factor_analyzer/factor_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,21 @@
"""

import warnings
from typing import Tuple

import numpy as np
import pandas as pd
import scipy as sp
from scipy.optimize import minimize
from scipy.stats import chi2, pearsonr
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_models import LinearRegression
from sklearn.utils import check_array
from sklearn.utils.extmath import randomized_svd
from sklearn.utils.validation import check_is_fitted

from .rotator import OBLIQUE_ROTATIONS, POSSIBLE_ROTATIONS, Rotator
from .utils import corr, impute_values, partial_correlations, smc
from .utils import corr, covariance_to_correlation, impute_values, partial_correlations, smc

POSSIBLE_SVDS = ['randomized', 'lapack']

Expand Down Expand Up @@ -114,6 +116,70 @@ def calculate_bartlett_sphericity(x):
return statistic, p_value


def calculate_cng_indices(
data: np.ndarray, model: str = "components"
) -> Tuple[int, pd.DataFrame]:
"""Calculate the Cattel-Nelson-Gorsuch indices, which are used to determine
the appropriate number of factors for a factor analysis.
Direct port of nCng function from nFactors package:
https://rdrr.io/cran/nFactors/man/nCng.html
Parameters
----------
data : array-like
The array of samples x observable for which to calculate CNG indices
model : str
"components" or "factors"
Returns
-------
num_factors : int
The number of components/factors to retain
details : pd.DataFrame
The eigenvalues and CNG indices of the dataset
"""
data = corr(data.values)
if model == "factors":
data -= np.linalg.pinv(np.diag(np.diag(np.linalg.pinv(data))))
# TODO: Should this line be here?
data = covariance_to_correlation(data)

values = np.sort(np.linalg.eigvals(data))[::-1]

num_variables = len(data)
if num_variables < 6:
raise ValueError("The number of variables must be at least 6")

fit_size = 3
cng = np.diff(
[
[
LinearRegression()
.fit(idx_values[:, np.newaxis], values[idx_values])
.coef_
for idx_values in [
np.arange(idx_fit, idx_fit + fit_size),
np.arange(idx_fit + fit_size, idx_fit + 2 * fit_size),
]
]
for idx_fit in range(num_variables - 2 * fit_size)
],
axis=1,
).squeeze(axis=(1, 2))

num_factors = np.nanargmax(cng) + fit_size

details = pd.DataFrame(
{
"data": values[: len(cng)],
"cng": cng,
}
)

return num_factors, details


class FactorAnalyzer(BaseEstimator, TransformerMixin):
"""
A FactorAnalyzer class, which -
Expand Down

0 comments on commit 62823a7

Please sign in to comment.