# Source code for qbiocode.utils.qc_winner_finder

## function to find datasets where QML methods did better than classical
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os


# [docs]
def qml_winner(results_df, rawevals_df, output_dir, tag):
    """Find the datasets on which a QML method reached the best mean F1 score.

    F1 scores are first averaged across splits for every
    (dataset, embedding, model, parameters) combination. The best averaged
    score per (dataset, model) is then taken, and a dataset counts as a
    "QML winner" when its top-scoring model is one of QSVC, QNN, VQC or PQK.

    The following .csv files are written into ``output_dir``; every file name
    starts with ``tag``:

    * ``<tag>_best_across_split.csv``: best split-averaged F1 per dataset
      and model.
    * ``<tag>_best_permodel_summary.csv``: mean, median, max and standard
      deviation of those best scores, per model.

    and, only when at least one QML winner exists:

    * ``<tag>_qml_winners.csv``: split-averaged scores of all methods on the
      QML-winner datasets.
    * ``<tag>_winner_evals.csv``: raw data evaluations of those datasets.
    * ``<tag>_winner_score.csv``: model, parameters and F1 score of the best
      QML method per winner dataset.
    * ``<tag>_winner_eval_score.csv``: raw data evaluations joined with the
      best QML method and its score.

    Args:
        results_df (pandas.DataFrame): Dataset in pandas corresponding to
            'ModelResults.csv'. The columns 'Dataset', 'embeddings', 'model'
            and 'f1_score' are read, plus 'Model_Parameters' or, when that is
            absent (grid-search runs), 'BestParams_GridSearch'.
        rawevals_df (pandas.DataFrame): Dataset in pandas corresponding to
            'RawDataEvaluation.csv'. Its 'Dataset' column is used to match
            rows against the winner datasets.
        output_dir (str): Directory the .csv files are written to.
        tag (str): Prefix prepended to every output file name.

    Returns:
        tuple or None: ``None`` when no dataset has a QML method on top.
        Otherwise a 3-tuple ``(qml_winners, winner_eval_score, df_best)``:

        * qml_winners (pandas.DataFrame): split-averaged scores of all
          methods for the datasets where a QML method performed best.
        * winner_eval_score (pandas.DataFrame): raw data evaluation of each
          winner dataset together with the best QML method, its parameters
          and its F1 score.
        * df_best (pandas.DataFrame): best split-averaged F1 score per
          dataset and model.
    """
    # Grid-search runs store the parameters under a different column name.
    if 'Model_Parameters' in results_df.columns:
        params_col = 'Model_Parameters'
    else:
        params_col = 'BestParams_GridSearch'

    # Mean F1 across all splits of each configuration.
    df_across_split = (
        results_df
        .groupby(['Dataset', 'embeddings', 'model', params_col])['f1_score']
        .mean()
        .reset_index()
    )

    # Best configuration of each method on each dataset.
    df_best = (
        df_across_split
        .groupby(['Dataset', 'model'])['f1_score']
        .max()
        .reset_index()
    )
    df_best.to_csv(os.path.join(output_dir, tag + '_best_across_split.csv'), index=False)

    # Per-model summary of the best scores across all datasets.
    df_best_permodel_summary = df_best.groupby('model')['f1_score'].agg(
        Mean_F1_Score='mean',
        Median_F1_Score='median',
        Max_F1_Score='max',
        StandardDev_F1_Score='std',
    )
    df_best_permodel_summary.to_csv(os.path.join(output_dir, tag + '_best_permodel_summary.csv'))

    # Top-scoring method of each dataset; keep the datasets where it is QML.
    best_per_dataset = df_best.loc[df_best.groupby('Dataset')['f1_score'].idxmax()]
    qml_list = ['QSVC', 'QNN', 'VQC', 'PQK']
    qml_datasets = best_per_dataset.loc[best_per_dataset['model'].isin(qml_list), 'Dataset']
    qml_winners = df_across_split[df_across_split['Dataset'].isin(qml_datasets)]

    if qml_winners.empty:
        print('*** QML methods were outperformed by CML methods in all datasets ***')
        return None

    qml_winners.to_csv(os.path.join(output_dir, tag + '_qml_winners.csv'), index=False)
    dataset = list(qml_winners['Dataset'].unique())

    # Pick the winning row among QML methods only. On a winner dataset the
    # best QML score equals the overall best score, so this changes nothing
    # except that a classical method tying that score is never reported as
    # the winner.
    qml_only = qml_winners[qml_winners['model'].isin(qml_list)]
    qc_method_and_score = qml_only.loc[qml_only.groupby('Dataset')['f1_score'].idxmax()]

    # Raw data evaluations of the winner datasets, in winner order.
    winner_evals_df = pd.concat(
        [rawevals_df.loc[rawevals_df['Dataset'] == name] for name in dataset]
    )
    winner_evals_df.to_csv(os.path.join(output_dir, tag + '_winner_evals.csv'), index=False)

    # Model, parameters and score of the winning method (the last three
    # columns of the split-averaged table).
    score_columns = ['model', params_col, 'f1_score']
    winner_scores_df = qc_method_and_score[score_columns]
    winner_scores_df.to_csv(os.path.join(output_dir, tag + '_winner_score.csv'), index=False)

    print(winner_scores_df)

    # Join evaluations and scores on the dataset name. A column-wise concat
    # would align on row labels, which are unrelated between the two frames
    # and would leave every row half empty.
    winner_eval_score = winner_evals_df.merge(
        qc_method_and_score[['Dataset'] + score_columns],
        on='Dataset',
        how='left',
        suffixes=('', '_winner'),
    )
    # contains dataset, evaluation, qml method, and average f1 score
    winner_eval_score.to_csv(os.path.join(output_dir, tag + '_winner_eval_score.csv'), index=False)

    # optional print statements
    print('*** The number of qml winners is', len(dataset))
    print('*** The qml winners are:', dataset)

    return qml_winners, winner_eval_score, df_best