Generic metrics

aisteer360.evaluation.metrics.generic

Generic evaluation metrics.

This module contains metrics that can be used to evaluate model outputs regardless of the specific task or domain (e.g., relevance, factuality, perplexity).
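
The three metrics on this page can be imported directly from their modules; a minimal sketch (import paths taken from the source locations listed below):

from aisteer360.evaluation.metrics.generic.factuality import Factuality
from aisteer360.evaluation.metrics.generic.perplexity import Perplexity
from aisteer360.evaluation.metrics.generic.relevance import Relevance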

factuality

Factuality

Bases: LLMJudgeMetric

Judge factual correctness of a response to a prompt.

Source code in aisteer360/evaluation/metrics/generic/factuality.py, lines 19-30
class Factuality(LLMJudgeMetric):
    """
    Judge factual correctness of a response to a prompt.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            prompt_template=_PROMPT,
            scale=(1, 5),
            **kwargs,
        )

Instance attributes:

  • base_prompt_template = prompt_template.strip()
  • batch_size = batch_size
  • device = device or ('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
  • extras = extras
  • format_instructions = self.output_parser.get_format_instructions()
  • max_retries = max_retries
  • model = AutoModelForCausalLM.from_pretrained(model_or_id)
  • name = self.__class__.__name__
  • num_return_sequences = int(gen_kwargs.pop('num_return_sequences', 1))
  • pipeline = TextGenerationPipeline(model=self.model, tokenizer=self.tokenizer)
  • scale = scale
  • tokenizer = tokenizer or AutoTokenizer.from_pretrained(model_or_id)
  • use_chat = hasattr(self.tokenizer, 'apply_chat_template') and self.tokenizer.chat_template is not None

compute(responses, prompts=None, **kwargs)

Compute LLM judge scores for a list of responses.

Evaluates each response using the configured judge model and prompt template. Scores are averaged when multiple samples are generated per response (via num_return_sequences).

Parameters:

  • responses (list[str], required): List of text responses to evaluate.
  • prompts (list[str] | None, default None): Optional list of prompts corresponding to each response. If provided, must be the same length as responses. These prompts can be referenced in the prompt_template using the {prompt} placeholder.
  • **kwargs (Any, default {}): Additional keyword arguments (currently unused).

Returns:

  dict[str, float | list[float]]: Score statistics containing:

  • "mean_score": Overall average score across all responses
  • "scores": List of mean scores for each response (averaged across samples)
  • "raw_scores": List of lists containing all individual scores for each response

Raises:

  AssertionError: If prompts is provided but has a different length than responses.

Source code in aisteer360/evaluation/metrics/base_judge.py, lines 206-286
@torch.inference_mode()
def compute(
    self,
    responses: list[str],
    prompts: list[str] | None = None,
    **kwargs: Any,
) -> dict[str, float | list[float]]:
    """Compute LLM judge scores for a list of responses.

    Evaluates each response using the configured judge model and prompt template. Scores are averaged when multiple
    samples are generated per response (via `num_return_sequences`).

    Args:
        responses (list[str]): List of text responses to evaluate.
        prompts (list[str] | None): Optional list of prompts corresponding to each response.
            If provided, must be the same length as responses. These prompts can be
            referenced in the prompt_template using the {prompt} placeholder.
        **kwargs: Additional keyword arguments (currently unused).

    Returns:
        Score statistics containing:

            - "mean_score": Overall average score across all responses
            - "scores": List of mean scores for each response (averaged across samples)
            - "raw_scores": List of lists containing all individual scores for each response

    Raises:
        AssertionError: If prompts is provided but has different length than responses.
    """

    if prompts is not None and len(prompts) != len(responses):
        raise AssertionError("`responses` and `prompts` must be the same length")

    # build prompts
    prompts_list: list[str] = []
    for i in range(len(responses)):
        fields: dict[str, str | float] = {
            "response": responses[i],
            "lower_bound": self.scale[0],
            "upper_bound": self.scale[1],
        }
        if prompts is not None:
            fields["prompt"] = prompts[i]

        prompt_core = self.base_prompt_template.format(**fields)
        prompt_formatted = self._wrap(prompt_core + "\n\n" + self.format_instructions)
        prompts_list.append(prompt_formatted)

    # generate
    prompt_scores: list[list[float]] = []
    for batch in self._batch_chunks(prompts_list, self.batch_size):
        outputs = self.pipeline(
            batch,
            num_return_sequences=self.num_return_sequences,
            return_full_text=False,
            clean_up_tokenization_spaces=True,
        )

        for prompt, generations in zip(batch, outputs):
            generations = generations if isinstance(generations, list) else [generations]
            assert len(generations) == self.num_return_sequences

            scores = []
            for generation in generations:
                reply_text = generation["generated_text"]
                try:
                    score = self.parse_fn(reply_text, self.scale)
                except Exception:
                    score = self._score_with_retries(prompt)
                scores.append(score)

            prompt_scores.append(scores)

    mean_per_prompt = [sum(prompt_score) / len(prompt_score) for prompt_score in prompt_scores]
    corpus_mean = sum(mean_per_prompt) / len(mean_per_prompt)

    return {
        "mean_score": corpus_mean,  # overall average
        "scores": mean_per_prompt,  # one number per original prompt
        "raw_scores": prompt_scores  # n_samples scores per prompt
    }
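
A minimal usage sketch for this metric. The judge model id is a placeholder, and passing it as the first positional argument (forwarded to LLMJudgeMetric as model_or_id) is an assumption inferred from the attribute listing above; adjust to the constructor signature of your installed version.

# illustrative only; "Qwen/Qwen2.5-7B-Instruct" is a placeholder judge model id
from aisteer360.evaluation.metrics.generic.factuality import Factuality

factuality = Factuality("Qwen/Qwen2.5-7B-Instruct")
results = factuality.compute(
    responses=["The Eiffel Tower is in Paris.", "The Eiffel Tower is in Berlin."],
    prompts=["Where is the Eiffel Tower?"] * 2,
)
print(results["mean_score"])   # overall average on the 1-5 scale
print(results["scores"])       # one mean score per response
print(results["raw_scores"])   # all sampled scores per response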

perplexity

Perplexity

Bases: Metric

Compute token-level perplexity for a batch of sentences.

Perplexity is the exponentiated mean cross-entropy between the language model’s predicted distribution and the true next token. Lower is better.
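
Concretely, for a sequence x of N scored tokens this definition reads (with p_θ the scoring language model):

    \mathrm{PPL}(x) = \exp\left(-\frac{1}{N}\sum_{t=1}^{N}\log p_\theta(x_t \mid x_{<t})\right)

i.e., the exponential of the masked mean per-token cross-entropy that compute() evaluates for each response.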

Parameters:

  • model_or_id (str | Module, required): Hugging Face model ID or an already-instantiated causal language model.
  • tokenizer (PreTrainedTokenizer | None, default None): Tokenizer to use. Leave None when passing a model ID to automatically load the matching tokenizer.
  • batch_size (int, default 16): Number of sentences per forward pass. Higher is faster until GPU memory becomes the bottleneck.
  • add_bos (bool, default True): Whether to prepend the tokenizer’s BOS token so the first word in each sentence is also scored. Ignored if the tokenizer has no BOS token.
  • max_length (int | None, default None): If set, truncate inputs to this length so they fit the model’s context window. None disables truncation.
  • device (str | None, default None): "cuda" or "cpu". When None, automatically selects GPU if available.

Attributes:

  • add_bos (bool): Whether a BOS token is prepended before scoring.
  • batch_size (int): Number of sentences processed per forward pass.
  • device (str): The device actually selected for computation ("cuda" or "cpu").
  • max_length (int | None): Truncation length for inputs, or None for no truncation.
  • model (PreTrainedModel): The loaded causal language model used to score tokens.
  • tokenizer (PreTrainedTokenizer): Tokenizer used for encoding, padding, and BOS handling.

Source code in aisteer360/evaluation/metrics/generic/perplexity.py, lines 10-128
class Perplexity(Metric):
    """Compute token-level perplexity for a batch of sentences.

    Perplexity is the exponentiated mean cross-entropy between the language model’s predicted distribution and the true
    next token. Lower is better.

    Args:
        model_or_id (str | torch.nn.Module): Hugging Face model ID or an already-instantiated causal language model.
        tokenizer (transformers.PreTrainedTokenizer | None, optional):
            Tokenizer to use.  Leave ``None`` when passing a model ID to automatically load the matching tokenizer.
            Defaults to ``None``.
        batch_size (int, optional): Number of sentences per forward pass. Higher is faster until GPU memory becomes the
            bottleneck. Defaults to ``16``.
        add_bos (bool, optional): Whether to prepend the tokenizer’s BOS token so the first word in each sentence is
            also scored. Ignored if the tokenizer has no BOS token. Defaults to ``True``.
        max_length (int | None, optional): If set, truncate inputs to this length so they fit the model’s context
            window. ``None`` disables truncation. Defaults to ``None``.
        device (str | None, optional): ``"cuda"`` or ``"cpu"``. When ``None``, automatically selects GPU if available.
            Defaults to ``None``.

    Attributes:
        add_bos (bool): Whether a BOS token is prepended before scoring.
        batch_size (int): Number of sentences processed per forward pass.
        device (str): The device actually selected for computation (``"cuda"`` or ``"cpu"``).
        max_length (int | None): Truncation length for inputs, or ``None`` for no truncation.
        model (transformers.PreTrainedModel): The loaded causal language model used to score tokens.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer used for encoding, padding, and BOS handling.
    """

    def __init__(
        self,
        model_or_id: str | torch.nn.Module,
        tokenizer: Any | None = None,
        batch_size: int = 16,
        add_bos: bool = True,
        max_length: int | None = None,
        device: str | None = None,
    ):
        super().__init__()

        if isinstance(model_or_id, str):
            self.model = AutoModelForCausalLM.from_pretrained(model_or_id)
            self.tokenizer = tokenizer or AutoTokenizer.from_pretrained(model_or_id)
        else:  # model object
            self.model = model_or_id
            self.tokenizer = tokenizer or AutoTokenizer.from_pretrained(model_or_id.config._name_or_path)

        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device).eval()
        self.batch_size = batch_size
        self.add_bos = add_bos and (self.tokenizer.bos_token_id is not None)
        self.max_length = max_length

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = (
                self.tokenizer.eos_token
                or self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
            )

    @torch.no_grad()
    def compute(
        self,
        responses: list[str],
        prompts: list[str] | None = None,
    ) -> dict[str, float]:
        """Compute perplexity for each response (and the mean across the batch).

        Args:
            responses (list[str]): Text sequences to score.
            prompts (list[str] | None, optional): Unused here; present for a uniform metric API.

        Returns:
            dict[str, float]: A dict with keys:

                - ``"mean_perplexity"``: mean perplexity over all inputs.
                - ``"perplexities"``: list of per-sample perplexities in input order.
        """
        perplexities: list[float] = []
        local_batch_size = self.batch_size

        for i in range(0, len(responses), local_batch_size):
            batch = responses[i : i + local_batch_size]

            encoding = self.tokenizer(
                batch,
                padding=True,
                truncation=self.max_length is not None,
                max_length=self.max_length,
                add_special_tokens=False,
                return_tensors="pt",
            ).to(self.device)
            input_ids = encoding["input_ids"]

            if self.add_bos:
                bos_tokens = torch.full(
                    (input_ids.size(0), 1),
                    self.tokenizer.bos_token_id,
                    device=self.device,
                )
                input_ids = torch.cat([bos_tokens, input_ids], dim=1)

            logits = self.model(input_ids).logits[:, :-1]
            labels = input_ids[:, 1:]

            loss_per_token = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                labels.reshape(-1),
                reduction="none",
            ).view(labels.size())

            mask = labels.ne(self.tokenizer.pad_token_id)
            seq_loss = (loss_per_token * mask).sum(1) / mask.sum(1)

            perplexities.extend(torch.exp(seq_loss).cpu().tolist())

        return {
            "mean_perplexity": sum(perplexities) / len(perplexities),
            "perplexities": perplexities,
        }

Instance attributes:

  • add_bos = add_bos and self.tokenizer.bos_token_id is not None
  • batch_size = batch_size
  • device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
  • extras = extras
  • max_length = max_length
  • model = AutoModelForCausalLM.from_pretrained(model_or_id)
  • name = self.__class__.__name__
  • tokenizer = tokenizer or AutoTokenizer.from_pretrained(model_or_id)

compute(responses, prompts=None)

Compute perplexity for each response (and the mean across the batch).

Parameters:

  • responses (list[str], required): Text sequences to score.
  • prompts (list[str] | None, default None): Unused here; present for a uniform metric API.

Returns:

  dict[str, float]: A dict with keys:

  • "mean_perplexity": mean perplexity over all inputs.
  • "perplexities": list of per-sample perplexities in input order.

Source code in aisteer360/evaluation/metrics/generic/perplexity.py, lines 69-128
@torch.no_grad()
def compute(
    self,
    responses: list[str],
    prompts: list[str] | None = None,
) -> dict[str, float]:
    """Compute perplexity for each response (and the mean across the batch).

    Args:
        responses (list[str]): Text sequences to score.
        prompts (list[str] | None, optional): Unused here; present for a uniform metric API.

    Returns:
        dict[str, float]: A dict with keys:

            - ``"mean_perplexity"``: mean perplexity over all inputs.
            - ``"perplexities"``: list of per-sample perplexities in input order.
    """
    perplexities: list[float] = []
    local_batch_size = self.batch_size

    for i in range(0, len(responses), local_batch_size):
        batch = responses[i : i + local_batch_size]

        encoding = self.tokenizer(
            batch,
            padding=True,
            truncation=self.max_length is not None,
            max_length=self.max_length,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(self.device)
        input_ids = encoding["input_ids"]

        if self.add_bos:
            bos_tokens = torch.full(
                (input_ids.size(0), 1),
                self.tokenizer.bos_token_id,
                device=self.device,
            )
            input_ids = torch.cat([bos_tokens, input_ids], dim=1)

        logits = self.model(input_ids).logits[:, :-1]
        labels = input_ids[:, 1:]

        loss_per_token = F.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            labels.reshape(-1),
            reduction="none",
        ).view(labels.size())

        mask = labels.ne(self.tokenizer.pad_token_id)
        seq_loss = (loss_per_token * mask).sum(1) / mask.sum(1)

        perplexities.extend(torch.exp(seq_loss).cpu().tolist())

    return {
        "mean_perplexity": sum(perplexities) / len(perplexities),
        "perplexities": perplexities,
    }
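
A minimal usage sketch (the model id "gpt2" is a placeholder; the constructor arguments are those documented above):

from aisteer360.evaluation.metrics.generic.perplexity import Perplexity

perplexity = Perplexity("gpt2", batch_size=8, max_length=1024)
results = perplexity.compute(
    responses=["The cat sat on the mat.", "Colorless green ideas sleep furiously."],
)
print(results["mean_perplexity"])  # mean over the batch; lower is better
print(results["perplexities"])     # one value per response, in input order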

relevance

Relevance

Bases: LLMJudgeMetric

Judge relevance of a response to a prompt.

Source code in aisteer360/evaluation/metrics/generic/relevance.py, lines 19-30
class Relevance(LLMJudgeMetric):
    """
    Judge relevance of a response to a prompt.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            prompt_template=_PROMPT,
            scale=(1, 5),
            **kwargs,
        )

Instance attributes:

  • base_prompt_template = prompt_template.strip()
  • batch_size = batch_size
  • device = device or ('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
  • extras = extras
  • format_instructions = self.output_parser.get_format_instructions()
  • max_retries = max_retries
  • model = AutoModelForCausalLM.from_pretrained(model_or_id)
  • name = self.__class__.__name__
  • num_return_sequences = int(gen_kwargs.pop('num_return_sequences', 1))
  • pipeline = TextGenerationPipeline(model=self.model, tokenizer=self.tokenizer)
  • scale = scale
  • tokenizer = tokenizer or AutoTokenizer.from_pretrained(model_or_id)
  • use_chat = hasattr(self.tokenizer, 'apply_chat_template') and self.tokenizer.chat_template is not None

compute(responses, prompts=None, **kwargs)

Compute LLM judge scores for a list of responses.

Evaluates each response using the configured judge model and prompt template. Scores are averaged when multiple samples are generated per response (via num_return_sequences).

Parameters:

  • responses (list[str], required): List of text responses to evaluate.
  • prompts (list[str] | None, default None): Optional list of prompts corresponding to each response. If provided, must be the same length as responses. These prompts can be referenced in the prompt_template using the {prompt} placeholder.
  • **kwargs (Any, default {}): Additional keyword arguments (currently unused).

Returns:

  dict[str, float | list[float]]: Score statistics containing:

  • "mean_score": Overall average score across all responses
  • "scores": List of mean scores for each response (averaged across samples)
  • "raw_scores": List of lists containing all individual scores for each response

Raises:

  AssertionError: If prompts is provided but has a different length than responses.

Source code in aisteer360/evaluation/metrics/base_judge.py, lines 206-286
@torch.inference_mode()
def compute(
    self,
    responses: list[str],
    prompts: list[str] | None = None,
    **kwargs: Any,
) -> dict[str, float | list[float]]:
    """Compute LLM judge scores for a list of responses.

    Evaluates each response using the configured judge model and prompt template. Scores are averaged when multiple
    samples are generated per response (via `num_return_sequences`).

    Args:
        responses (list[str]): List of text responses to evaluate.
        prompts (list[str] | None): Optional list of prompts corresponding to each response.
            If provided, must be the same length as responses. These prompts can be
            referenced in the prompt_template using the {prompt} placeholder.
        **kwargs: Additional keyword arguments (currently unused).

    Returns:
        Score statistics containing:

            - "mean_score": Overall average score across all responses
            - "scores": List of mean scores for each response (averaged across samples)
            - "raw_scores": List of lists containing all individual scores for each response

    Raises:
        AssertionError: If prompts is provided but has different length than responses.
    """

    if prompts is not None and len(prompts) != len(responses):
        raise AssertionError("`responses` and `prompts` must be the same length")

    # build prompts
    prompts_list: list[str] = []
    for i in range(len(responses)):
        fields: dict[str, str | float] = {
            "response": responses[i],
            "lower_bound": self.scale[0],
            "upper_bound": self.scale[1],
        }
        if prompts is not None:
            fields["prompt"] = prompts[i]

        prompt_core = self.base_prompt_template.format(**fields)
        prompt_formatted = self._wrap(prompt_core + "\n\n" + self.format_instructions)
        prompts_list.append(prompt_formatted)

    # generate
    prompt_scores: list[list[float]] = []
    for batch in self._batch_chunks(prompts_list, self.batch_size):
        outputs = self.pipeline(
            batch,
            num_return_sequences=self.num_return_sequences,
            return_full_text=False,
            clean_up_tokenization_spaces=True,
        )

        for prompt, generations in zip(batch, outputs):
            generations = generations if isinstance(generations, list) else [generations]
            assert len(generations) == self.num_return_sequences

            scores = []
            for generation in generations:
                reply_text = generation["generated_text"]
                try:
                    score = self.parse_fn(reply_text, self.scale)
                except Exception:
                    score = self._score_with_retries(prompt)
                scores.append(score)

            prompt_scores.append(scores)

    mean_per_prompt = [sum(prompt_score) / len(prompt_score) for prompt_score in prompt_scores]
    corpus_mean = sum(mean_per_prompt) / len(mean_per_prompt)

    return {
        "mean_score": corpus_mean,  # overall average
        "scores": mean_per_prompt,  # one number per original prompt
        "raw_scores": prompt_scores  # n_samples scores per prompt
    }
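
A minimal usage sketch mirroring the Factuality example above; the judge model id is a placeholder and the positional model_or_id argument is an assumption inferred from the attribute listing:

from aisteer360.evaluation.metrics.generic.relevance import Relevance

relevance = Relevance("Qwen/Qwen2.5-7B-Instruct")
results = relevance.compute(
    responses=["Paris is the capital of France.", "I like turtles."],
    prompts=["What is the capital of France?"] * 2,
)
print(results["mean_score"])  # average relevance on the 1-5 scale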