Commonsense MCQA metrics

aisteer360.evaluation.metrics.custom.commonsense_mcqa

Evaluation metrics for the CommonsenseMCQA use case.

mcqa_accuracy

MCQAAccuracy

Bases: Metric

Exact-match accuracy for multiple-choice QA.

Source code in aisteer360/evaluation/metrics/custom/commonsense_mcqa/mcqa_accuracy.py
class MCQAAccuracy(Metric):
    """
    Exact-match accuracy for multiple-choice QA.
    """

    def compute(
        self,
        responses: list[str],
        prompts: list[str] | None = None,
        reference_answers: list[str] | None = None,
        question_ids: list[str] | None = None,
        **kwargs
    ) -> dict[str, float]:
        """Computes trial-level and question-level accuracy metrics.

        Args:
            responses: List of predicted answer choices (e.g., 'A', 'B', 'C', 'D').
            prompts: List of question prompts (unused, for interface compatibility).
            reference_answers: List of correct answer choices.
            question_ids: Optional question IDs for grouping responses by question.
            **kwargs: Additional arguments (unused).

        Returns:
            Dictionary of accuracy score statistics with values:

                - "trial_mean": micro (attempt-level accuracy)
                - "trial_std": sample std-dev over trials
                - "question_mean": macro (majority-vote accuracy)
                - "question_std": sample std-dev over questions

        Raises:
            ValueError: If reference_answers is None or length mismatches occur.
        """

        if reference_answers is None:
            raise ValueError("MCQAAccuracy needs `reference_answers`.")
        if len(responses) != len(reference_answers):
            raise ValueError("`responses` and `reference_answers` must be the same length.")
        if question_ids is not None and len(responses) != len(question_ids):
            raise ValueError("`question_ids` must match length of `responses`.")

        # micro
        attempt_correct = [
            choice.strip().upper() == answer.strip().upper()
            for choice, answer in zip(responses, reference_answers) if choice is not None
        ]
        attempt_accuracy = sum(attempt_correct) / len(attempt_correct) if attempt_correct else 0.0
        attempt_accuracy_std = self._sample_std(attempt_correct, attempt_accuracy)

        # macro (majority vote per question)
        if question_ids is None:
            question_accuracy = attempt_accuracy
            question_accuracy_std = attempt_accuracy_std
        else:
            # pair correctness with question ids directly so that filtered-out
            # None responses cannot misalign the two sequences
            votes = defaultdict(list)
            for qid, choice, answer in zip(question_ids, responses, reference_answers):
                if choice is None:
                    continue
                votes[qid].append(choice.strip().upper() == answer.strip().upper())

            majority_outcomes = [int(sum(vote) > len(vote) / 2) for vote in votes.values()]
            question_accuracy = sum(majority_outcomes) / len(votes) if votes else 0.0
            question_accuracy_std = self._sample_std(majority_outcomes, question_accuracy)

        return {
            "trial_mean": attempt_accuracy,
            "trial_std": attempt_accuracy_std,
            "question_mean": question_accuracy,
            "question_std": question_accuracy_std,
        }

    @staticmethod
    def _sample_std(binary, mean):
        """Computes sample standard deviation for binary outcomes.

        Args:
            binary: List of binary values (0 or 1).
            mean: Pre-computed mean of the binary values.

        Returns:
            Sample standard deviation using Bessel's correction (n-1).
        """
        n = len(binary)
        if n < 2:
            return 0.0
        var = sum((x - mean) ** 2 for x in binary) / (n - 1)
        return sqrt(var)
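
The trial_std and question_std values above are sample standard deviations with Bessel's correction, as implemented in _sample_std. A quick plain-Python check of the same formula on a toy list of binary outcomes:

from math import sqrt

# Toy binary outcomes (1 = correct attempt, 0 = incorrect attempt).
outcomes = [1, 1, 0, 1]
mean = sum(outcomes) / len(outcomes)                                 # 0.75
var = sum((x - mean) ** 2 for x in outcomes) / (len(outcomes) - 1)   # divide by n - 1 (Bessel's correction)
print(mean, sqrt(var))  # 0.75 0.5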
Instance attributes:
- extras = extras
- name = self.__class__.__name__
compute(responses, prompts=None, reference_answers=None, question_ids=None, **kwargs)

Computes trial-level and question-level accuracy metrics.

Parameters:

- responses (list[str], required): List of predicted answer choices (e.g., 'A', 'B', 'C', 'D').
- prompts (list[str] | None, default None): List of question prompts (unused, for interface compatibility).
- reference_answers (list[str] | None, default None): List of correct answer choices.
- question_ids (list[str] | None, default None): Optional question IDs for grouping responses by question.
- **kwargs: Additional arguments (unused).

Returns:

- dict[str, float]: Dictionary of accuracy score statistics with values:
    - "trial_mean": micro (attempt-level accuracy)
    - "trial_std": sample std-dev over trials
    - "question_mean": macro (majority-vote accuracy)
    - "question_std": sample std-dev over questions

Raises:

- ValueError: If reference_answers is None or length mismatches occur.
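
A minimal usage sketch follows. The import path is taken from the source location above; constructing the metric without arguments is an assumption (MCQAAccuracy defines no __init__ of its own and the Metric base class signature is not shown here), so adapt as needed.

# Hypothetical usage sketch; import path taken from the module shown above,
# no-argument construction assumed.
from aisteer360.evaluation.metrics.custom.commonsense_mcqa.mcqa_accuracy import MCQAAccuracy

metric = MCQAAccuracy()

# Three trials each for two questions: q1 is answered correctly 2/3 times,
# q2 only 1/3 times, so the majority vote splits the two questions.
scores = metric.compute(
    responses=["A", "A", "B", "C", "D", "D"],
    reference_answers=["A", "A", "A", "C", "C", "C"],
    question_ids=["q1", "q1", "q1", "q2", "q2", "q2"],
)
print(scores["trial_mean"])     # 0.5 (3 of 6 attempts correct)
print(scores["question_mean"])  # 0.5 (q1 majority correct, q2 majority wrong)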


mcqa_calibration

MCQACalibration

Bases: Metric

Calibration metrics for multiple-choice QA.

Measures how well model confidence scores align with actual performance using Expected Calibration Error (ECE) and related metrics.

Source code in aisteer360/evaluation/metrics/custom/commonsense_mcqa/mcqa_calibration.py
class MCQACalibration(Metric):
    """
    Calibration metrics for multiple-choice QA.

    Measures how well model confidence scores align with actual performance using Expected Calibration Error (ECE)
    and related metrics.
    """

    def __init__(self, n_bins: int = 10):
        super().__init__()
        self.n_bins = n_bins

    def compute(
        self,
        responses: list[str],
        reference_answers: list[str] = None,
        confidence_scores: list[float] = None,
        question_ids: list[str] | None = None,
        **kwargs
    ) -> dict[str, float]:
        """Computes calibration metrics for model predictions.

        Args:
            responses: List of predicted answer choices (e.g., 'A', 'B', 'C', 'D').
            reference_answers: List of correct answer choices.
            confidence_scores: List of model confidence scores (0.0 to 1.0).
            question_ids: Optional question IDs (unused, for interface compatibility).
            **kwargs: Additional arguments (unused).

        Returns:
            Dictionary of calibration metrics with values:

                - "ece": Expected Calibration Error (lower is better, 0.0 is perfect)
                - "avg_confidence": Model's average confidence across all predictions
                - "overconfidence": avg_confidence - accuracy (positive means overconfident)

        Raises:
            ValueError: If reference_answers or confidence_scores is None.
        """

        if reference_answers is None:
            raise ValueError("MCQACalibration needs `reference_answers`.")
        if confidence_scores is None:
            raise ValueError("MCQACalibration needs `confidence_scores`.")

        # calculate ece
        valid_data = [
            (resp, ref, conf)
            for resp, ref, conf in zip(responses, reference_answers, confidence_scores)
            if conf is not None
        ]
        responses, answers, confidences = zip(*valid_data)
        confidences = np.array(confidences)
        accuracies = np.array([response == answer for response, answer in zip(responses, answers)], dtype=float)
        avg_confidence = float(np.mean(confidences))
        avg_accuracy = float(np.mean(accuracies))
        ece = self._calculate_ece(confidences, accuracies)

        return {
            "ece": ece,
            "avg_confidence": avg_confidence,
            "overconfidence": avg_confidence - avg_accuracy,
        }

    def _calculate_ece(self, confidences: np.ndarray, accuracies: np.ndarray) -> float:
        """Calculates Expected Calibration Error using binned confidence scores.

        ECE measures the difference between confidence and accuracy across confidence bins. For each bin, it computes
        the absolute difference between average confidence and average accuracy, weighted by the proportion of samples
        in that bin.

        Args:
            confidences: Array of confidence scores (0.0 to 1.0).
            accuracies: Array of binary accuracy values (0.0 or 1.0).

        Returns:
            Expected Calibration Error as a float between 0.0 and 1.0.
        """
        bin_boundaries = np.linspace(0, 1, self.n_bins + 1)
        ece = 0

        for i in range(self.n_bins):
            if i == self.n_bins - 1:
                in_bin = (confidences >= bin_boundaries[i]) & (confidences <= bin_boundaries[i + 1])
            else:
                in_bin = (confidences >= bin_boundaries[i]) & (confidences < bin_boundaries[i + 1])

            prop_in_bin = np.mean(in_bin)

            if prop_in_bin > 0:
                bin_accuracy = np.mean(accuracies[in_bin])
                bin_confidence = np.mean(confidences[in_bin])
                ece += prop_in_bin * abs(bin_confidence - bin_accuracy)

        return float(ece)
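
To make the binning concrete, here is a small worked sketch (toy numbers, independent of the class) that reproduces the weighted confidence/accuracy gap computed by _calculate_ece with the default 10 bins:

import numpy as np

# Five toy predictions: confidence scores and whether each prediction was correct.
confidences = np.array([0.95, 0.85, 0.65, 0.65, 0.35])
accuracies = np.array([1.0, 1.0, 1.0, 0.0, 0.0])

# Same binning scheme as above: each occupied bin contributes its
# |mean confidence - mean accuracy| gap weighted by its share of samples.
#   [0.3, 0.4): 1 sample,  conf 0.35, acc 0.0 -> gap 0.35, weight 1/5
#   [0.6, 0.7): 2 samples, conf 0.65, acc 0.5 -> gap 0.15, weight 2/5
#   [0.8, 0.9): 1 sample,  conf 0.85, acc 1.0 -> gap 0.15, weight 1/5
#   [0.9, 1.0]: 1 sample,  conf 0.95, acc 1.0 -> gap 0.05, weight 1/5
edges = np.linspace(0, 1, 11)
ece = 0.0
for lo, hi in zip(edges[:-1], edges[1:]):
    in_bin = (confidences >= lo) & ((confidences <= hi) if hi == 1.0 else (confidences < hi))
    if in_bin.any():
        ece += in_bin.mean() * abs(confidences[in_bin].mean() - accuracies[in_bin].mean())
print(round(ece, 3))  # 0.17  (= 0.2*0.35 + 0.4*0.15 + 0.2*0.15 + 0.2*0.05)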
Instance attributes:
- extras = extras
- n_bins = n_bins
- name = self.__class__.__name__
compute(responses, reference_answers=None, confidence_scores=None, question_ids=None, **kwargs)

Computes calibration metrics for model predictions.

Parameters:

- responses (list[str], required): List of predicted answer choices (e.g., 'A', 'B', 'C', 'D').
- reference_answers (list[str], default None): List of correct answer choices.
- confidence_scores (list[float], default None): List of model confidence scores (0.0 to 1.0).
- question_ids (list[str] | None, default None): Optional question IDs (unused, for interface compatibility).
- **kwargs: Additional arguments (unused).

Returns:

- dict[str, float]: Dictionary of calibration metrics with values:
    - "ece": Expected Calibration Error (lower is better, 0.0 is perfect)
    - "avg_confidence": Model's average confidence across all predictions
    - "overconfidence": avg_confidence - accuracy (positive means overconfident)

Raises:

- ValueError: If reference_answers or confidence_scores is None.
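
A minimal usage sketch under the same assumptions as the earlier example about the import path and construction; the toy numbers match the binning sketch above:

# Hypothetical usage sketch; import path taken from the module shown above.
from aisteer360.evaluation.metrics.custom.commonsense_mcqa.mcqa_calibration import MCQACalibration

metric = MCQACalibration(n_bins=10)

scores = metric.compute(
    responses=["A", "B", "C", "D", "A"],
    reference_answers=["A", "B", "C", "A", "B"],   # 3 of 5 correct
    confidence_scores=[0.95, 0.85, 0.65, 0.65, 0.35],
)
print(scores["avg_confidence"])  # ≈ 0.69
print(scores["overconfidence"])  # ≈ 0.09 (average confidence exceeds the 0.6 accuracy)
print(scores["ece"])             # ≈ 0.17 (see the worked binning sketch above)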


mcqa_positional_bias

MCQAPositionalBias

Bases: Metric

Positional bias metrics for multiple-choice QA.

Measures whether the model exhibits bias toward selecting certain answer positions.

Source code in aisteer360/evaluation/metrics/custom/commonsense_mcqa/mcqa_positional_bias.py
class MCQAPositionalBias(Metric):
    """
    Positional bias metrics for multiple-choice QA.

    Measures whether the model exhibits bias toward selecting certain answer positions.
    """

    def compute(
        self,
        responses: list[str],
        prompts: list[str] | None = None,
        question_ids: list[str] | None = None,
        **kwargs
    ) -> dict[str, float]:
        """Computes positional bias metrics for model predictions.

        Calculates how much the model's choice frequencies deviate from uniform distribution across answer positions.
        For K answer choices, each position should ideally be selected 1/K of the time.

        Args:
            responses: List of predicted answer choices (e.g., 'A', 'B', 'C', 'D').
            prompts: List of question prompts (unused, for interface compatibility).
            question_ids: Optional question IDs for computing per-question bias variance.
            **kwargs: Additional arguments (unused).

        Returns:
            Dictionary of positional bias metrics with values:

                - "mean": Overall positional bias (mean |f_i - 1/K| across positions)
                - "std": Sample standard deviation of bias computed per question

        Note:

        - If question_ids is None, per-question analysis is skipped and std will be 0.0.
        """

        valid_responses = [r for r in responses if r is not None]
        if not valid_responses:
            return {"mean": 0.0, "std": 0.0}

        position_counts = Counter(valid_responses)
        total_responses = len(valid_responses)
        positions = sorted(position_counts.keys())
        position_frequencies = [position_counts.get(pos, 0) / total_responses for pos in positions]
        expected_frequency = 1 / len(positions)

        # positional bias per question (skipped when question_ids is None)
        bias_per_question = []
        if question_ids is not None:
            responses_by_question = defaultdict(list)
            for response, question_id in zip(responses, question_ids):
                if response is not None:
                    responses_by_question[question_id].append(response)

            for question_responses in responses_by_question.values():
                if not question_responses:
                    continue
                counts_for_question = Counter(question_responses)
                total_for_question = len(question_responses)
                frequencies_for_question = [counts_for_question.get(pos, 0) / total_for_question for pos in positions]
                bias_for_question = np.mean([abs(freq - expected_frequency) for freq in frequencies_for_question])
                bias_per_question.append(bias_for_question)

        return {
            "mean": float(np.mean([abs(freq - expected_frequency) for freq in position_frequencies])),
            "std": float(np.std(bias_per_question, ddof=1)) if len(bias_per_question) > 1 else 0.0
        }
Instance attributes:
- extras = extras
- name = self.__class__.__name__
compute(responses, prompts=None, question_ids=None, **kwargs)

Computes positional bias metrics for model predictions.

Calculates how much the model's choice frequencies deviate from uniform distribution across answer positions. For K answer choices, each position should ideally be selected 1/K of the time.

Parameters:

- responses (list[str], required): List of predicted answer choices (e.g., 'A', 'B', 'C', 'D').
- prompts (list[str] | None, default None): List of question prompts (unused, for interface compatibility).
- question_ids (list[str] | None, default None): Optional question IDs for computing per-question bias variance.
- **kwargs: Additional arguments (unused).

Returns:

- dict[str, float]: Dictionary of positional bias metrics with values:
    - "mean": Overall positional bias (mean |f_i - 1/K| across positions)
    - "std": Sample standard deviation of bias computed per question

Note:

- If question_ids is None, per-question analysis is skipped and std will be 0.0.
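
A minimal usage sketch under the same import-path and constructor assumptions as the earlier examples:

# Hypothetical usage sketch; import path taken from the module shown above.
from aisteer360.evaluation.metrics.custom.commonsense_mcqa.mcqa_positional_bias import MCQAPositionalBias

metric = MCQAPositionalBias()

# Eight attempts over two questions; "A" is picked 6/8 times while "B" and "C"
# get one pick each, so the choice distribution over the observed positions
# {A, B, C} is far from uniform (expected frequency 1/3 each).
scores = metric.compute(
    responses=["A", "A", "A", "B", "A", "A", "A", "C"],
    question_ids=["q1", "q1", "q1", "q1", "q2", "q2", "q2", "q2"],
)
print(scores["mean"])  # ≈ 0.28, i.e. mean of |6/8 - 1/3|, |1/8 - 1/3|, |1/8 - 1/3|
print(scores["std"])   # 0.0 here: both questions show the same per-question bias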