# ====== Base imports ======
import warnings
import numpy as np
import pandas as pd
import hfda
# ====== SciPy imports ======
from scipy.stats import entropy
from scipy.spatial import ConvexHull as CH
# ====== Scikit-learn / scikit-dimension imports ======
from skdim import id
from sklearn.feature_selection import mutual_info_classif, VarianceThreshold
from sklearn.neighbors import KernelDensity
from sklearn.manifold import Isomap
def get_dimensions(df):
"""Get the number of features, samples, and feature-to-sample ratio from a DataFrame.
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
Returns:
tuple: (num_features, num_samples, ratio)
- num_features (int): Number of features in the DataFrame
- num_samples (int): Number of samples in the DataFrame
- ratio (float): Feature-to-sample ratio
"""
# number of features
num_features = df.shape[1]
# of samples
num_samples = df.shape[0]
# feature-to-sample ratio
ratio = num_features/num_samples
return num_features, num_samples, ratio
def get_intrinsic_dim(df):
"""Get intrinsic dimension of the data using lPCA from skdim.
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
Returns:
float: Intrinsic dimension of the data
"""
    # Intrinsic dimension, estimated via scikit-dimension's local PCA (lPCA) method
    pca = id.lPCA()  # initialize the lPCA estimator from skdim
    pca.fit(df)  # fit the estimator to the data
return pca.dimension_
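# Illustrative usage sketch (kept as a comment so it does not run on import; the iris
# loader below is sklearn's bundled dataset, not part of this module):
#
#   from sklearn.datasets import load_iris
#   iris_df = pd.DataFrame(load_iris().data)
#   print(get_intrinsic_dim(iris_df))  # lPCA estimate of the intrinsic dimension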
def get_condition_number(df):
"""Get condition number of a matrix.
A function with a high condition number is said to be ill-conditioned.
Ill conditioned matrices produce large errors in its output even with small errors in its input.
Low condition number means more stable errors.
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
float: condition number of the matrix represented in df
"""
# In general,
# meaning that it can produce large errors in its output even with small errors in its input.
# Conversely, a function with a low condition number is well-conditioned and more stable in terms of its output.
return np.linalg.cond(df)
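# Illustrative usage sketch (kept as a comment so it does not run on import): two nearly
# collinear columns drive the condition number up, signalling an ill-conditioned matrix.
#
#   x = np.random.rand(100)
#   ill = pd.DataFrame({'a': x, 'b': x + 1e-9 * np.random.rand(100)})
#   get_condition_number(ill)  # very large value -> ill-conditioned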
def get_fdr(df,y):
"""Calculate Fisher Discriminant Ratio for a given dataset.
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
        y (array-like): supervised binary class labels
Returns:
float: Fisher Discriminant ratio
"""
    X = df.values
    y = np.asarray(y)
    class_labels = np.unique(y)
n_classes = len(class_labels)
FDR = 0
if n_classes != 2:
warnings.warn("WARNING: Fisher Discriminant Ratio is only defined for binary classes. ")
else:
mean1 = np.mean(X[y == class_labels[0]], axis=0) #mean for class1
mean2 = np.mean(X[y == class_labels[1]], axis=0) #mean for class2
#calculate within-class scatter matrices
scatter_within = np.zeros((X.shape[1], X.shape[1]))
for label in class_labels:
X_class = X[y == label]
scatter_within += np.cov(X_class.T)
#calculate between-class scatter matrix
scatter_between = np.outer(mean1 - mean2, mean1 - mean2)
#compute FDR
FDR = np.trace(scatter_between)/np.trace(scatter_within)
return FDR
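# In the binary case above this is FDR = trace(S_B) / trace(S_W), with between-class
# scatter S_B = (m1 - m2)(m1 - m2)^T and within-class scatter S_W = cov(X_1) + cov(X_2).
# Illustrative usage sketch (kept as a comment so it does not run on import): well-separated
# classes yield a noticeably larger ratio than overlapping ones.
#
#   X_demo = pd.DataFrame(np.vstack([np.random.randn(50, 3), np.random.randn(50, 3) + 5]))
#   y_demo = np.array([0] * 50 + [1] * 50)
#   get_fdr(X_demo, y_demo)  # large compared to classes drawn from the same distribution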
def get_total_correlation(df):
"""Calculate Total Correlation
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
float: Total correlation
"""
corr_matrix = df.corr() #correlation matrix
#total correlation by subtracting diagonal values to remove self-correlation
total_correlation = corr_matrix.abs().sum().sum() - len(df.columns)
return total_correlation
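# Illustrative usage sketch (kept as a comment so it does not run on import): a duplicated
# column adds roughly 2 to the total (its two off-diagonal |corr| entries of 1), while
# independent columns contribute close to 0.
#
#   x = np.random.randn(500)
#   corr_df = pd.DataFrame({'a': x, 'b': x, 'c': np.random.randn(500)})
#   get_total_correlation(corr_df)  # approximately 2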
def get_variance(df):
"""Get variance
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
avg_var (float): Mean variance
std_var (float): Standard deviation of variance
"""
variations = round(df.var(), 2)
avg_var = variations.mean()
std_var = variations.std()
return avg_var, std_var
def get_coefficient_var(df):
"""Get coefficient of variance
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
avg_co_of_v (float): Mean coefficient of variance
std_var (float): Standard deviation of coefficient of variance
"""
co_of_v = (df.std() / df.mean()) * 100
avg_co_of_v = co_of_v.mean()
std_co_of_v = co_of_v.std()
return avg_co_of_v, std_co_of_v
def get_nnz(df):
"""Calculate nonzero values in the data
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
int: nonzero count
"""
return np.count_nonzero(df.values)
def get_low_var_features(df, num_features):
"""Calculate get count of low variance features
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
num_features (int): number of features in the dataset
Raises:
ValueError: If no feature is strong enough to keep
Returns:
int: count of features with low variance
"""
threshold = np.percentile(df.var(), 25)
try:
low_var_features = num_features - VarianceThreshold(threshold).fit(df).get_support().sum()
except ValueError:
print("No feature is strong enough to keep")
low_var_features = None
return low_var_features
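# Illustrative usage sketch (kept as a comment so it does not run on import): because the
# threshold is the 25th percentile of the per-feature variances, roughly a quarter of the
# features are expected to be reported as low-variance.
#
#   wide = pd.DataFrame(np.random.randn(200, 8) * np.arange(1, 9))
#   get_low_var_features(wide, wide.shape[1])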
def get_log_density(df):
"""Calculate the mean log density of the data
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
Returns:
float: mean log kernel density
"""
    # Create a Gaussian KernelDensity estimator and fit it to the data
    kde = KernelDensity(bandwidth=0.2, kernel='gaussian').fit(df)
log_density = kde.score_samples(df)
return log_density.mean()
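# Illustrative usage sketch (kept as a comment so it does not run on import): with the fixed
# bandwidth of 0.2, tightly clustered points score a higher mean log density than spread-out ones.
#
#   tight = pd.DataFrame(np.random.randn(300, 2) * 0.1)
#   spread = pd.DataFrame(np.random.randn(300, 2) * 10)
#   get_log_density(tight) > get_log_density(spread)  # True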
def get_fractal_dim(df, k_max):
"""Calculate the fractal dimension of the data using Higuchi's method
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
        k_max (int): Maximum time interval k used in Higuchi's method
Returns:
float: Fractal dimension of the data
"""
FD = hfda.measure(df, k_max)
return FD
def get_moments(df):
"""Compute third and fourth order moments of the data
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
Returns:
avg_skew (float): Mean skewness
std_skew (float): Standard deviation of skewness
avg_kurt (float): Mean kurtosis
std_kurt (float): Standard deviation of kurtosis
"""
# Skewness
skew = df.skew()
avg_skew = skew.mean()
std_skew = skew.std()
# Kurtosis
kurt = df.kurtosis()
avg_kurt = kurt.mean()
std_kurt = kurt.std()
return avg_skew, std_skew, avg_kurt, std_kurt
def get_entropy(y):
"""Calculate entropy of the target variable
Args:
y (int): supervised binary class label
Returns:
avg_y_entropy (float): mean entropy
std_y_entropy (flat): standard deviation of entropy
"""
y_entropy = entropy(np.bincount(y), base=2) # Compute the entropy of the target variable (y)
avg_y_entropy = y_entropy.mean()
std_y_entropy = y_entropy.std()
return avg_y_entropy, std_y_entropy
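# Illustrative usage sketch (kept as a comment so it does not run on import): a perfectly
# balanced binary label has an entropy of 1 bit, while a heavily imbalanced one approaches 0.
#
#   avg_e, _ = get_entropy(np.array([0, 1] * 50))      # avg_e == 1.0 bit
#   avg_e, _ = get_entropy(np.array([0] * 99 + [1]))   # avg_e ~= 0.08 bits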
def get_volume(df):
"""Get volume of the data from Convex Hull
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
Returns:
volume (float): Volume of the space spanned by the features of the data
"""
vol = 0
if df.shape[0] <= df.shape[1]:
warnings.warn("Convex Hull requires number of observations > number of features")
else:
        vol = CH(df, qhull_options='QJ').volume  # 'QJ' joggles the input to avoid precision errors
return vol
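# Illustrative usage sketch (kept as a comment so it does not run on import): the convex hull
# volume of points drawn uniformly from the unit square approaches 1 as the sample grows.
#
#   square = pd.DataFrame(np.random.rand(1000, 2))
#   get_volume(square)  # close to, but below, 1.0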
def get_complexity(df, n_neighbors=10, n_components=2):
""" Measure the manifold complexity by fitting Isomap and analyzing the geodesic vs. Euclidean distances.
This function computes the reconstruction error of the Isomap algorithm, which serves as an indicator of the complexity of the manifold represented by the data.
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
n_neighbors: Number of neighbors for the Isomap algorithm. Default value 10
n_components: Number of components (dimensions) for Isomap projection. Default value 2
Returns:
- reconstruction_error: float
The reconstruction error of the Isomap model, which indicates the complexity of the manifold.
- reconstruction_error: The residual error of geodesic distances
"""
isomap = Isomap(n_neighbors=10, n_components=2)
isomap.fit(df.values)
#reconstruction error - an indicator of complexity
reconstruction_error = isomap.reconstruction_error()
return reconstruction_error
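# evaluate() below relies on get_mutual_information, which is not defined elsewhere in this
# module. The helper here is a minimal sketch built on sklearn's mutual_info_classif
# (imported above), assuming the intended statistic is the mean estimated mutual
# information between each feature and the class label.
def get_mutual_information(df, y):
    """Estimate the mean mutual information between the features and the target variable.
    Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
        y (array-like): supervised class labels
    Returns:
        float: mean mutual information across features
    """
    mi = mutual_info_classif(df.values, y)  # one MI estimate per feature
    return mi.mean()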
def evaluate(df, y, file):
"""This function evaluates a dataset and returns a transposed summary DataFrame with various statistical measures, derived from the dataset.
Using the functions defined above, it computes intrinsic dimension, condition number, Fisher Discriminant Ratio, total correlation, mutual information, variance, coefficient of variation,
data sparsity, low variance features, data density, fractal dimension, data distributions (skewness and kurtosis), entropy of the target variable, and manifold complexity.
The summary DataFrame is transposed for easier readability and contains the dataset name, number of features, number of samples, feature-to-sample ratio, and various statistical measures.
This function is useful for quickly summarizing the characteristics of a dataset, especially in the context of machine learning and data analysis, allowing you to correlate the dataset's
properties with its performance in predictive modeling tasks.
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
y (int): supervised binary class label
file (str): Name of the dataset file for identification in the summary DataFrame
Returns:
transposed (pandas.DataFrame): Summary DataFrame containing various statistical measures of the dataset
"""
# Select only numeric columns from the DataFrame
df_numeric = df.select_dtypes(include=[np.number])
# Calculate statistical measures
n_features, n_samples, feature_sample_ratio = get_dimensions(df_numeric)
# get intrinsic dimension
intrinsic_dim = get_intrinsic_dim(df_numeric)
# Condition number
condition_number = get_condition_number(df_numeric)
    # Class separability via the Fisher Discriminant Ratio
fdr = get_fdr(df_numeric, y)
# Total correlation
total_correlation = get_total_correlation(df_numeric)
# Mutual information
mutual_info = get_mutual_information(df_numeric, y)
# Variance
avg_var, std_var = get_variance(df_numeric)
# Coefficient of variance
avg_co_of_v, std_co_of_v = get_coefficient_var(df_numeric)
# Data sparsity
    count_nonzero = get_nnz(df_numeric)
# Get the number of low variance features
num_low_variance_features = get_low_var_features(df_numeric, n_features)
# Data density
mean_log_density = get_log_density(df_numeric)
# Fractal Dimension
k_max = 5
fractal_dim = get_fractal_dim(df_numeric, k_max)
# Data distributions
avg_skew, std_skew, avg_kurt, std_kurt = get_moments(df_numeric)
# entropy
avg_y_entropy, std_y_entropy = get_entropy(y)
#volume of data
# volume = get_volume(df_numeric)
#manifold complexity
complexity = get_complexity(df_numeric)
# Create summary DataFrame
summary_df = pd.DataFrame.from_dict({
# Data set
'Dataset': file,
# Dimensions
'# Features': n_features,
'# Samples': n_samples,
'Feature_Samples_ratio': feature_sample_ratio,
# Intrinsic dimension
'Intrinsic_Dimension': intrinsic_dim,
# Condition number
'Condition number': condition_number,
        # Class separability
'Fisher Discriminant Ratio': fdr,
# Feature Correlations
        'Total Correlations': total_correlation,
        'Mutual information': mutual_info,
# Data sparsity
'# Non-zero entries': count_nonzero,
'# Low variance features': num_low_variance_features,
'Variation': avg_var,
'std_var': std_var,
'Coefficient of Variation %': avg_co_of_v,
'std_co_of_v': std_co_of_v,
# Data distributions
'Skewness': avg_skew,
'std_skew': std_skew,
'Kurtosis': avg_kurt,
'std_kurt': std_kurt,
# Data density
'Mean Log Kernel Density': mean_log_density,
# volume of feature space
#'Volume': volume,
# Manifold complexity
'Isomap Reconstruction Error': complexity,
# Fractal dimension
'Fractal dimension': fractal_dim, # calculated via Higuchi Dimension
'Entropy': avg_y_entropy,
'std_entropy': std_y_entropy
},
orient='index')
transposed = summary_df.T
#transposed.to_csv('DataSetEvaluation.csv', sep='\t', index=False)
#print(transposed)
return transposed
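# Illustrative end-to-end usage (kept as a comment so it does not run on import; the file
# name and target column below are hypothetical placeholders):
#
#   df = pd.read_csv('my_dataset.csv')
#   y = df.pop('target').values
#   summary = evaluate(df, y, 'my_dataset.csv')
#   summary.to_csv('DataSetEvaluation.csv', sep='\t', index=False)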