# ====== Base imports ======
import warnings
import numpy as np
import pandas as pd
import hfda
# ====== SciPy imports ======
from scipy.stats import entropy
from scipy.spatial import ConvexHull as CH
# ====== Scikit-learn / scikit-dimension imports ======
from skdim import id
from sklearn.feature_selection import mutual_info_classif, VarianceThreshold
from sklearn.neighbors import KernelDensity
from sklearn.manifold import Isomap
def get_dimensions(df):
"""Get the number of features, samples, and feature-to-sample ratio from a DataFrame.
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
Returns:
tuple: (num_features, num_samples, ratio)
- num_features (int): Number of features in the DataFrame
- num_samples (int): Number of samples in the DataFrame
- ratio (float): Feature-to-sample ratio
"""
# number of features
num_features = df.shape[1]
# of samples
num_samples = df.shape[0]
# feature-to-sample ratio
ratio = num_features/num_samples
return num_features, num_samples, ratio
def get_intrinsic_dim(df):
"""Get intrinsic dimension of the data using lPCA from skdim.
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
Returns:
float: Intrinsic dimension of the data
"""
    # Intrinsic dimension, estimated via scikit-dimension's local PCA (lPCA) method
    pca = id.lPCA()  # initialize the lPCA estimator from skdim
    pca.fit(df)  # fit the estimator to the data
return pca.dimension_
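# Illustrative usage sketch (kept as a comment so it does not run on import; the iris
# loader below is sklearn's bundled dataset, not part of this module):
#
#   from sklearn.datasets import load_iris
#   iris_df = pd.DataFrame(load_iris().data)
#   print(get_intrinsic_dim(iris_df))  # lPCA estimate of the intrinsic dimension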
def get_condition_number(df):
"""Get condition number of a matrix.
A function with a high condition number is said to be ill-conditioned.
Ill conditioned matrices produce large errors in its output even with small errors in its input.
Low condition number means more stable errors.
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
float: condition number of the matrix represented in df
"""
# In general,
# meaning that it can produce large errors in its output even with small errors in its input.
# Conversely, a function with a low condition number is well-conditioned and more stable in terms of its output.
return np.linalg.cond(df)
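# Illustrative usage sketch (kept as a comment so it does not run on import): two nearly
# collinear columns drive the condition number up, signalling an ill-conditioned matrix.
#
#   x = np.random.rand(100)
#   ill = pd.DataFrame({'a': x, 'b': x + 1e-9 * np.random.rand(100)})
#   get_condition_number(ill)  # very large value -> ill-conditioned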
def get_fdr(df,y):
"""Calculate Fisher Discriminant Ratio for a given dataset.
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
        y (array-like): supervised binary class labels
Returns:
float: Fisher Discriminant ratio
"""
    X = df.values
    y = np.asarray(y)
    class_labels = np.unique(y)
n_classes = len(class_labels)
FDR = 0
if n_classes != 2:
warnings.warn("WARNING: Fisher Discriminant Ratio is only defined for binary classes. ")
else:
mean1 = np.mean(X[y == class_labels[0]], axis=0) #mean for class1
mean2 = np.mean(X[y == class_labels[1]], axis=0) #mean for class2
#calculate within-class scatter matrices
scatter_within = np.zeros((X.shape[1], X.shape[1]))
for label in class_labels:
X_class = X[y == label]
scatter_within += np.cov(X_class.T)
#calculate between-class scatter matrix
scatter_between = np.outer(mean1 - mean2, mean1 - mean2)
#compute FDR
FDR = np.trace(scatter_between)/np.trace(scatter_within)
return FDR
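# In the binary case above this is FDR = trace(S_B) / trace(S_W), with between-class
# scatter S_B = (m1 - m2)(m1 - m2)^T and within-class scatter S_W = cov(X_1) + cov(X_2).
# Illustrative usage sketch (kept as a comment so it does not run on import): well-separated
# classes yield a noticeably larger ratio than overlapping ones.
#
#   X_demo = pd.DataFrame(np.vstack([np.random.randn(50, 3), np.random.randn(50, 3) + 5]))
#   y_demo = np.array([0] * 50 + [1] * 50)
#   get_fdr(X_demo, y_demo)  # large compared to classes drawn from the same distribution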
def get_total_correlation(df):
"""Calculate Total Correlation
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
float: Total correlation
"""
corr_matrix = df.corr() #correlation matrix
#total correlation by subtracting diagonal values to remove self-correlation
total_correlation = corr_matrix.abs().sum().sum() - len(df.columns)
return total_correlation
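# Illustrative usage sketch (kept as a comment so it does not run on import): a duplicated
# column adds roughly 2 to the total (its two off-diagonal |corr| entries of 1), while
# independent columns contribute close to 0.
#
#   x = np.random.randn(500)
#   corr_df = pd.DataFrame({'a': x, 'b': x, 'c': np.random.randn(500)})
#   get_total_correlation(corr_df)  # approximately 2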
def get_variance(df):
"""Get variance
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
avg_var (float): Mean variance
std_var (float): Standard deviation of variance
"""
variations = round(df.var(), 2)
avg_var = variations.mean()
std_var = variations.std()
return avg_var, std_var
def get_coefficient_var(df):
"""Get coefficient of variance
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
avg_co_of_v (float): Mean coefficient of variance
std_var (float): Standard deviation of coefficient of variance
"""
co_of_v = (df.std() / df.mean()) * 100
avg_co_of_v = co_of_v.mean()
std_co_of_v = co_of_v.std()
return avg_co_of_v, std_co_of_v
def get_nnz(df):
"""Calculate nonzero values in the data
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
int: nonzero count
"""
return np.count_nonzero(df.values)
def get_low_var_features(df, num_features):
"""Calculate get count of low variance features
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
num_features (int): number of features in the dataset
Raises:
ValueError: If no feature is strong enough to keep
Returns:
int: count of features with low variance
"""
threshold = np.percentile(df.var(), 25)
try:
low_var_features = num_features - VarianceThreshold(threshold).fit(df).get_support().sum()
except ValueError:
print("No feature is strong enough to keep")
low_var_features = None
return low_var_features
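# Illustrative usage sketch (kept as a comment so it does not run on import): because the
# threshold is the 25th percentile of the per-feature variances, roughly a quarter of the
# features are expected to be reported as low-variance.
#
#   wide = pd.DataFrame(np.random.randn(200, 8) * np.arange(1, 9))
#   get_low_var_features(wide, wide.shape[1])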
def get_log_density(df):
"""Calculate the mean log density of the data
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
Returns:
float: mean log kernel density
"""
    # Create a Gaussian KernelDensity estimator and fit it to the data
    kde = KernelDensity(bandwidth=0.2, kernel='gaussian').fit(df)
log_density = kde.score_samples(df)
return log_density.mean()
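# Illustrative usage sketch (kept as a comment so it does not run on import): with the fixed
# bandwidth of 0.2, tightly clustered points score a higher mean log density than spread-out ones.
#
#   tight = pd.DataFrame(np.random.randn(300, 2) * 0.1)
#   spread = pd.DataFrame(np.random.randn(300, 2) * 10)
#   get_log_density(tight) > get_log_density(spread)  # True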
def get_fractal_dim(df, k_max):
"""Calculate the fractal dimension of the data using Higuchi's method
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
        k_max (int): Maximum time interval k used in Higuchi's method
Returns:
float: Fractal dimension of the data
"""
FD = hfda.measure(df, k_max)
return FD
def get_moments(df):
"""Compute third and fourth order moments of the data
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
Returns:
avg_skew (float): Mean skewness
std_skew (float): Standard deviation of skewness
avg_kurt (float): Mean kurtosis
std_kurt (float): Standard deviation of kurtosis
"""
# Skewness
skew = df.skew()
avg_skew = skew.mean()
std_skew = skew.std()
# Kurtosis
kurt = df.kurtosis()
avg_kurt = kurt.mean()
std_kurt = kurt.std()
return avg_skew, std_skew, avg_kurt, std_kurt
def get_entropy(y):
"""Calculate entropy of the target variable
Args:
y (int): supervised binary class label
Returns:
avg_y_entropy (float): mean entropy
std_y_entropy (flat): standard deviation of entropy
"""
y_entropy = entropy(np.bincount(y), base=2) # Compute the entropy of the target variable (y)
avg_y_entropy = y_entropy.mean()
std_y_entropy = y_entropy.std()
return avg_y_entropy, std_y_entropy
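# Illustrative usage sketch (kept as a comment so it does not run on import): a perfectly
# balanced binary label has an entropy of 1 bit, while a heavily imbalanced one approaches 0.
#
#   avg_e, _ = get_entropy(np.array([0, 1] * 50))      # avg_e == 1.0 bit
#   avg_e, _ = get_entropy(np.array([0] * 99 + [1]))   # avg_e ~= 0.08 bits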
def get_volume(df):
"""Get volume of the data from Convex Hull
Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
Returns:
volume (float): Volume of the space spanned by the features of the data
"""
vol = 0
if df.shape[0] <= df.shape[1]:
warnings.warn("Convex Hull requires number of observations > number of features")
else:
        vol = CH(df, qhull_options='QJ').volume  # 'QJ' joggles the input to avoid precision errors
return vol
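# Illustrative usage sketch (kept as a comment so it does not run on import): the convex hull
# volume of points drawn uniformly from the unit square approaches 1 as the sample grows.
#
#   square = pd.DataFrame(np.random.rand(1000, 2))
#   get_volume(square)  # close to, but below, 1.0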
def get_complexity(df, n_neighbors=10, n_components=2):
""" Measure the manifold complexity by fitting Isomap and analyzing the geodesic vs. Euclidean distances.
This function computes the reconstruction error of the Isomap algorithm, which serves as an indicator of the complexity of the manifold represented by the data.
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
n_neighbors: Number of neighbors for the Isomap algorithm. Default value 10
n_components: Number of components (dimensions) for Isomap projection. Default value 2
Returns:
- reconstruction_error: float
The reconstruction error of the Isomap model, which indicates the complexity of the manifold.
- reconstruction_error: The residual error of geodesic distances
"""
isomap = Isomap(n_neighbors=10, n_components=2)
isomap.fit(df.values)
#reconstruction error - an indicator of complexity
reconstruction_error = isomap.reconstruction_error()
return reconstruction_error
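# evaluate() below relies on get_mutual_information, which is not defined elsewhere in this
# module. The helper here is a minimal sketch built on sklearn's mutual_info_classif
# (imported above), assuming the intended statistic is the mean estimated mutual
# information between each feature and the class label.
def get_mutual_information(df, y):
    """Estimate the mean mutual information between the features and the target variable.
    Args:
        df (pandas.DataFrame): Dataset in pandas with observations in rows, features in columns
        y (array-like): supervised class labels
    Returns:
        float: mean mutual information across features
    """
    mi = mutual_info_classif(df.values, y)  # one MI estimate per feature
    return mi.mean()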
def evaluate(df, y, file):
"""This function evaluates a dataset and returns a transposed summary DataFrame with various statistical measures, derived from the dataset.
Using the functions defined above, it computes intrinsic dimension, condition number, Fisher Discriminant Ratio, total correlation, mutual information, variance, coefficient of variation,
data sparsity, low variance features, data density, fractal dimension, data distributions (skewness and kurtosis), entropy of the target variable, and manifold complexity.
The summary DataFrame is transposed for easier readability and contains the dataset name, number of features, number of samples, feature-to-sample ratio, and various statistical measures.
This function is useful for quickly summarizing the characteristics of a dataset, especially in the context of machine learning and data analysis, allowing you to correlate the dataset's
properties with its performance in predictive modeling tasks.
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
y (int): supervised binary class label
file (str): Name of the dataset file for identification in the summary DataFrame
Returns:
transposed (pandas.DataFrame): Summary DataFrame containing various statistical measures of the dataset
"""
# Select only numeric columns from the DataFrame
df_numeric = df.select_dtypes(include=[np.number])
# Calculate statistical measures
n_features, n_samples, feature_sample_ratio = get_dimensions(df_numeric)
# get intrinsic dimension
intrinsic_dim = get_intrinsic_dim(df_numeric)
# Condition number
condition_number = get_condition_number(df_numeric)
    # Class separability via the Fisher Discriminant Ratio
fdr = get_fdr(df_numeric, y)
# Total correlation
total_correlation = get_total_correlation(df_numeric)
# Mutual information
mutual_info = get_mutual_information(df_numeric, y)
# Variance
avg_var, std_var = get_variance(df_numeric)
# Coefficient of variance
avg_co_of_v, std_co_of_v = get_coefficient_var(df_numeric)
# Data sparsity
    count_nonzero = get_nnz(df_numeric)
# Get the number of low variance features
num_low_variance_features = get_low_var_features(df_numeric, n_features)
# Data density
mean_log_density = get_log_density(df_numeric)
# Fractal Dimension
k_max = 5
fractal_dim = get_fractal_dim(df_numeric, k_max)
# Data distributions
avg_skew, std_skew, avg_kurt, std_kurt = get_moments(df_numeric)
# entropy
avg_y_entropy, std_y_entropy = get_entropy(y)
#volume of data
# volume = get_volume(df_numeric)
#manifold complexity
complexity = get_complexity(df_numeric)
# Create summary DataFrame
summary_df = pd.DataFrame.from_dict({
# Data set
'Dataset': file,
# Dimensions
'# Features': n_features,
'# Samples': n_samples,
'Feature_Samples_ratio': feature_sample_ratio,
# Intrinsic dimension
'Intrinsic_Dimension': intrinsic_dim,
# Condition number
'Condition number': condition_number,
        # Class separability
'Fisher Discriminant Ratio': fdr,
# Feature Correlations
        'Total Correlations': total_correlation,
        'Mutual information': mutual_info,
# Data sparsity
'# Non-zero entries': count_nonzero,
'# Low variance features': num_low_variance_features,
'Variation': avg_var,
'std_var': std_var,
'Coefficient of Variation %': avg_co_of_v,
'std_co_of_v': std_co_of_v,
# Data distributions
'Skewness': avg_skew,
'std_skew': std_skew,
'Kurtosis': avg_kurt,
'std_kurt': std_kurt,
# Data density
'Mean Log Kernel Density': mean_log_density,
# volume of feature space
#'Volume': volume,
# Manifold complexity
'Isomap Reconstruction Error': complexity,
# Fractal dimension
'Fractal dimension': fractal_dim, # calculated via Higuchi Dimension
'Entropy': avg_y_entropy,
'std_entropy': std_y_entropy
},
orient='index')
transposed = summary_df.T
#transposed.to_csv('DataSetEvaluation.csv', sep='\t', index=False)
#print(transposed)
return transposed
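# Illustrative end-to-end usage (kept as a comment so it does not run on import; the file
# name and target column below are hypothetical placeholders):
#
#   df = pd.read_csv('my_dataset.csv')
#   y = df.pop('target').values
#   summary = evaluate(df, y, 'my_dataset.csv')
#   summary.to_csv('DataSetEvaluation.csv', sep='\t', index=False)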