# Source code for qbiocode.utils.qc_winner_finder

## function to find datasets where QML methods did better than classical
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

def qml_winner(results_df, rawevals_df, output_dir, tag):
    """Find datasets where QML methods outperformed classical ML (CML) methods.

    Identifies input datasets for which the best mean-F1 model (across splits)
    is a QML method, and writes several .csv files with the relevant scores and
    raw-data evaluations for those datasets to ``output_dir`` for further
    analysis. Also computes the best result per method across all splits and a
    per-model summary across all datasets.

    Args:
        results_df (pandas.DataFrame): Dataset in pandas corresponding to
            'ModelResults.csv'. Must contain columns 'Dataset', 'embeddings',
            'model', 'f1_score', and either 'Model_Parameters' or
            'BestParams_GridSearch'.
        rawevals_df (pandas.DataFrame): Dataset in pandas corresponding to
            'RawDataEvaluation.csv'. Must contain a 'Dataset' column.
        output_dir (str): Directory where the output .csv files are written.
        tag (str): Prefix used for every output file name.

    Returns:
        tuple or None: ``(qml_winners, winner_eval_score, df_best)`` where

        - qml_winners (pandas.DataFrame): contains the input datasets for which
          at least one QML method performed better than CML, with the scores of
          all the methods.
        - winner_eval_score (pandas.DataFrame): contains the input datasets,
          their evaluation, and scores for the specific QML method that yielded
          the best score.
        - df_best (pandas.DataFrame): best mean F1 per (Dataset, model) pair.

        Returns ``None`` when no dataset has a QML winner.
    """
    # Work on copies so the caller's frames are never mutated.
    df = results_df.copy()
    rawevals = rawevals_df.copy()

    # Mean F1 across splits. The parameter column is named 'Model_Parameters'
    # for a plain run and 'BestParams_GridSearch' when a grid search was run.
    param_col = ('Model_Parameters' if 'Model_Parameters' in df.columns
                 else 'BestParams_GridSearch')
    df_across_split = (df.groupby(['Dataset', 'embeddings', 'model', param_col])
                         ['f1_score'].mean().reset_index())

    # Best result per (dataset, model) across embeddings and parameters.
    df_best = (df_across_split.groupby(['Dataset', 'model'])
                              ['f1_score'].max().reset_index())
    df_best.to_csv(os.path.join(output_dir, tag + '_best_across_split.csv'),
                   index=False)

    # Per-model summary across all datasets (mean/median/max/std of best F1).
    df_best_permodel_summary = (df_best.groupby('model')['f1_score']
                                       .agg(['mean', 'median', 'max', 'std']))
    df_best_permodel_summary.columns = ['Mean_F1_Score', 'Median_F1_Score',
                                        'Max_F1_Score', 'StandardDev_F1_Score']
    df_best_permodel_summary.to_csv(
        os.path.join(output_dir, tag + '_best_permodel_summary.csv'))

    # Single best model per dataset.
    best_per_dataset = df_best.loc[df_best.groupby('Dataset')['f1_score'].idxmax()]

    # QML method names; a dataset is a "QML winner" when its best model is one
    # of these.
    qml_list = ['QSVC', 'QNN', 'VQC', 'PQK']
    winner_names = best_per_dataset[best_per_dataset['model'].isin(qml_list)]['Dataset']
    qml_winners = df_across_split[df_across_split['Dataset'].isin(winner_names)]

    if qml_winners.empty:
        print('*** QML methods were outperformed by CML methods in all datasets ***')
        return None

    # For each winning dataset, keep the row of the single best method.
    best_idx = qml_winners.groupby('Dataset')['f1_score'].idxmax()
    qc_method_and_score = qml_winners.loc[best_idx]
    qml_winners.to_csv(os.path.join(output_dir, tag + '_qml_winners.csv'),
                       index=False)
    dataset = list(qml_winners['Dataset'].unique())

    # Raw-data evaluations for the winning datasets, in winner order.
    # (Avoids shadowing the builtin `eval` that the original loop used.)
    winner_evals_df = pd.concat(
        [rawevals.loc[rawevals['Dataset'] == ds] for ds in dataset])
    winner_evals_df.to_csv(os.path.join(output_dir, tag + '_winner_evals.csv'),
                           index=False)

    # Last three columns are model, its parameters, and the mean F1 score.
    winner_scores_df = qc_method_and_score.iloc[:, -3:]
    winner_scores_df.to_csv(os.path.join(output_dir, tag + '_winner_score.csv'),
                            index=False)
    print(winner_scores_df)

    # BUGFIX: the two frames carry unrelated indices (rawevals rows vs.
    # df_across_split rows), so index-aligned concat NaN-padded instead of
    # pairing each dataset's evaluation with its best score. Align positionally;
    # both frames are ordered by the same dataset sequence.
    # NOTE(review): assumes one evaluation row per dataset in rawevals — confirm.
    winner_eval_score = pd.concat(
        [winner_evals_df.reset_index(drop=True),
         winner_scores_df.reset_index(drop=True)], axis=1)
    winner_eval_score.to_csv(
        os.path.join(output_dir, tag + '_winner_eval_score.csv'), index=False)

    print('*** The number of qml winners is', len(dataset))
    print('*** The qml winners are:', dataset)
    return qml_winners, winner_eval_score, df_best