Generic metrics

aisteer360.evaluation.metrics.generic

Generic evaluation metrics.

This module contains metrics that can be used to evaluate model outputs regardless of the specific task or domain (e.g., relevance, factuality, perplexity).
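
The three metrics on this page can be imported directly from their modules; a minimal sketch (import paths taken from the source locations listed below):

from aisteer360.evaluation.metrics.generic.factuality import Factuality
from aisteer360.evaluation.metrics.generic.perplexity import Perplexity
from aisteer360.evaluation.metrics.generic.relevance import Relevance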

factuality

Factuality

Bases: LLMJudgeMetric

Judge factual correctness of a response to a prompt.

Source code in aisteer360/evaluation/metrics/generic/factuality.py, lines 19-30
class Factuality(LLMJudgeMetric):
    """
    Judge factual correctness of a response to a prompt.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            prompt_template=_PROMPT,
            scale=(1, 5),
            **kwargs,
        )

Instance attributes:

  • base_prompt_template = prompt_template.strip()
  • batch_size = batch_size
  • device = device or ('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
  • extras = extras
  • format_instructions = self.output_parser.get_format_instructions()
  • max_retries = max_retries
  • model = AutoModelForCausalLM.from_pretrained(model_or_id)
  • name = self.__class__.__name__
  • num_return_sequences = int(gen_kwargs.pop('num_return_sequences', 1))
  • pipeline = TextGenerationPipeline(model=self.model, tokenizer=self.tokenizer)
  • scale = scale
  • tokenizer = tokenizer or AutoTokenizer.from_pretrained(model_or_id)
  • use_chat = hasattr(self.tokenizer, 'apply_chat_template') and self.tokenizer.chat_template is not None

compute(responses, prompts=None, **kwargs)

Compute LLM judge scores for a list of responses.

Evaluates each response using the configured judge model and prompt template. Scores are averaged when multiple samples are generated per response (via num_return_sequences).

Parameters:

  • responses (list[str], required): List of text responses to evaluate.
  • prompts (list[str] | None, default None): Optional list of prompts corresponding to each response. If provided, must be the same length as responses. These prompts can be referenced in the prompt_template using the {prompt} placeholder.
  • **kwargs (Any, default {}): Additional keyword arguments (currently unused).

Returns:

  dict[str, float | list[float]]: Score statistics containing:

  • "mean_score": Overall average score across all responses
  • "scores": List of mean scores for each response (averaged across samples)
  • "raw_scores": List of lists containing all individual scores for each response

Raises:

  AssertionError: If prompts is provided but has a different length than responses.

Source code in aisteer360/evaluation/metrics/base_judge.py, lines 206-286
@torch.inference_mode()
def compute(
    self,
    responses: list[str],
    prompts: list[str] | None = None,
    **kwargs: Any,
) -> dict[str, float | list[float]]:
    """Compute LLM judge scores for a list of responses.

    Evaluates each response using the configured judge model and prompt template. Scores are averaged when multiple
    samples are generated per response (via `num_return_sequences`).

    Args:
        responses (list[str]): List of text responses to evaluate.
        prompts (list[str] | None): Optional list of prompts corresponding to each response.
            If provided, must be the same length as responses. These prompts can be
            referenced in the prompt_template using the {prompt} placeholder.
        **kwargs: Additional keyword arguments (currently unused).

    Returns:
        Score statistics containing:

            - "mean_score": Overall average score across all responses
            - "scores": List of mean scores for each response (averaged across samples)
            - "raw_scores": List of lists containing all individual scores for each response

    Raises:
        AssertionError: If prompts is provided but has different length than responses.
    """

    if prompts is not None and len(prompts) != len(responses):
        raise AssertionError("`responses` and `prompts` must be the same length")

    # build prompts
    prompts_list: list[str] = []
    for i in range(len(responses)):
        fields: dict[str, str | float] = {
            "response": responses[i],
            "lower_bound": self.scale[0],
            "upper_bound": self.scale[1],
        }
        if prompts is not None:
            fields["prompt"] = prompts[i]

        prompt_core = self.base_prompt_template.format(**fields)
        prompt_formatted = self._wrap(prompt_core + "\n\n" + self.format_instructions)
        prompts_list.append(prompt_formatted)

    # generate
    prompt_scores: list[list[float]] = []
    for batch in self._batch_chunks(prompts_list, self.batch_size):
        outputs = self.pipeline(
            batch,
            num_return_sequences=self.num_return_sequences,
            return_full_text=False,
            clean_up_tokenization_spaces=True,
        )

        for prompt, generations in zip(batch, outputs):
            generations = generations if isinstance(generations, list) else [generations]
            assert len(generations) == self.num_return_sequences

            scores = []
            for generation in generations:
                reply_text = generation["generated_text"]
                try:
                    score = self.parse_fn(reply_text, self.scale)
                except Exception:
                    score = self._score_with_retries(prompt)
                scores.append(score)

            prompt_scores.append(scores)

    mean_per_prompt = [sum(prompt_score) / len(prompt_score) for prompt_score in prompt_scores]
    corpus_mean = sum(mean_per_prompt) / len(mean_per_prompt)

    return {
        "mean_score": corpus_mean,  # overall average
        "scores": mean_per_prompt,  # one number per original prompt
        "raw_scores": prompt_scores  # n_samples scores per prompt
    }
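
A minimal usage sketch for this metric. The judge model id is a placeholder, and passing it as the first positional argument (forwarded to LLMJudgeMetric as model_or_id) is an assumption inferred from the attribute listing above; adjust to the constructor signature of your installed version.

# illustrative only; "Qwen/Qwen2.5-7B-Instruct" is a placeholder judge model id
from aisteer360.evaluation.metrics.generic.factuality import Factuality

factuality = Factuality("Qwen/Qwen2.5-7B-Instruct")
results = factuality.compute(
    responses=["The Eiffel Tower is in Paris.", "The Eiffel Tower is in Berlin."],
    prompts=["Where is the Eiffel Tower?"] * 2,
)
print(results["mean_score"])   # overall average on the 1-5 scale
print(results["scores"])       # one mean score per response
print(results["raw_scores"])   # all sampled scores per response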

perplexity

Perplexity

Bases: Metric

Compute token-level perplexity for a batch of sentences.

Perplexity is the exponentiated mean cross-entropy between the language model’s predicted distribution and the true next token. Lower is better.
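
Concretely, for a sequence x of N scored tokens this definition reads (with p_θ the scoring language model):

    \mathrm{PPL}(x) = \exp\left(-\frac{1}{N}\sum_{t=1}^{N}\log p_\theta(x_t \mid x_{<t})\right)

i.e., the exponential of the masked mean per-token cross-entropy that compute() evaluates for each response.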

Parameters:

  • model_or_id (str | Module, required): Hugging Face model ID or an already-instantiated causal language model.
  • tokenizer (PreTrainedTokenizer | None, default None): Tokenizer to use. Leave None when passing a model ID to automatically load the matching tokenizer.
  • batch_size (int, default 16): Number of sentences per forward pass. Higher is faster until GPU memory becomes the bottleneck.
  • add_bos (bool, default True): Whether to prepend the tokenizer’s BOS token so the first word in each sentence is also scored. Ignored if the tokenizer has no BOS token.
  • max_length (int | None, default None): If set, truncate inputs to this length so they fit the model’s context window. None disables truncation.
  • device (str | None, default None): "cuda" or "cpu". When None, automatically selects GPU if available.

Attributes:

  • add_bos (bool): Whether a BOS token is prepended before scoring.
  • batch_size (int): Number of sentences processed per forward pass.
  • device (str): The device actually selected for computation ("cuda" or "cpu").
  • max_length (int | None): Truncation length for inputs, or None for no truncation.
  • model (PreTrainedModel): The loaded causal language model used to score tokens.
  • tokenizer (PreTrainedTokenizer): Tokenizer used for encoding, padding, and BOS handling.

Source code in aisteer360/evaluation/metrics/generic/perplexity.py, lines 10-128
class Perplexity(Metric):
    """Compute token-level perplexity for a batch of sentences.

    Perplexity is the exponentiated mean cross-entropy between the language model’s predicted distribution and the true
    next token. Lower is better.

    Args:
        model_or_id (str | torch.nn.Module): Hugging Face model ID or an already-instantiated causal language model.
        tokenizer (transformers.PreTrainedTokenizer | None, optional):
            Tokenizer to use.  Leave ``None`` when passing a model ID to automatically load the matching tokenizer.
            Defaults to ``None``.
        batch_size (int, optional): Number of sentences per forward pass. Higher is faster until GPU memory becomes the
            bottleneck. Defaults to ``16``.
        add_bos (bool, optional): Whether to prepend the tokenizer’s BOS token so the first word in each sentence is
            also scored. Ignored if the tokenizer has no BOS token. Defaults to ``True``.
        max_length (int | None, optional): If set, truncate inputs to this length so they fit the model’s context
            window. ``None`` disables truncation. Defaults to ``None``.
        device (str | None, optional): ``"cuda"`` or ``"cpu"``. When ``None``, automatically selects GPU if available.
            Defaults to ``None``.

    Attributes:
        add_bos (bool): Whether a BOS token is prepended before scoring.
        batch_size (int): Number of sentences processed per forward pass.
        device (str): The device actually selected for computation (``"cuda"`` or ``"cpu"``).
        max_length (int | None): Truncation length for inputs, or ``None`` for no truncation.
        model (transformers.PreTrainedModel): The loaded causal language model used to score tokens.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer used for encoding, padding, and BOS handling.
    """

    def __init__(
        self,
        model_or_id: str | torch.nn.Module,
        tokenizer: Any | None = None,
        batch_size: int = 16,
        add_bos: bool = True,
        max_length: int | None = None,
        device: str | None = None,
    ):
        super().__init__()

        if isinstance(model_or_id, str):
            self.model = AutoModelForCausalLM.from_pretrained(model_or_id)
            self.tokenizer = tokenizer or AutoTokenizer.from_pretrained(model_or_id)
        else:  # model object
            self.model = model_or_id
            self.tokenizer = tokenizer or AutoTokenizer.from_pretrained(model_or_id.config._name_or_path)

        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device).eval()
        self.batch_size = batch_size
        self.add_bos = add_bos and (self.tokenizer.bos_token_id is not None)
        self.max_length = max_length

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = (
                self.tokenizer.eos_token
                or self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
            )

    @torch.no_grad()
    def compute(
        self,
        responses: list[str],
        prompts: list[str] | None = None,
    ) -> dict[str, float]:
        """Compute perplexity for each response (and the mean across the batch).

        Args:
            responses (list[str]): Text sequences to score.
            prompts (list[str] | None, optional): Unused here; present for a uniform metric API.

        Returns:
            dict[str, float]: A dict with keys:

                - ``"mean_perplexity"``: mean perplexity over all inputs.
                - ``"perplexities"``: list of per-sample perplexities in input order.
        """
        perplexities: list[float] = []
        local_batch_size = self.batch_size

        for i in range(0, len(responses), local_batch_size):
            batch = responses[i : i + local_batch_size]

            encoding = self.tokenizer(
                batch,
                padding=True,
                truncation=self.max_length is not None,
                max_length=self.max_length,
                add_special_tokens=False,
                return_tensors="pt",
            ).to(self.device)
            input_ids = encoding["input_ids"]

            if self.add_bos:
                bos_tokens = torch.full(
                    (input_ids.size(0), 1),
                    self.tokenizer.bos_token_id,
                    device=self.device,
                )
                input_ids = torch.cat([bos_tokens, input_ids], dim=1)

            logits = self.model(input_ids).logits[:, :-1]
            labels = input_ids[:, 1:]

            loss_per_token = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                labels.reshape(-1),
                reduction="none",
            ).view(labels.size())

            mask = labels.ne(self.tokenizer.pad_token_id)
            seq_loss = (loss_per_token * mask).sum(1) / mask.sum(1)

            perplexities.extend(torch.exp(seq_loss).cpu().tolist())

        return {
            "mean_perplexity": sum(perplexities) / len(perplexities),
            "perplexities": perplexities,
        }

Instance attributes:

  • add_bos = add_bos and self.tokenizer.bos_token_id is not None
  • batch_size = batch_size
  • device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
  • extras = extras
  • max_length = max_length
  • model = AutoModelForCausalLM.from_pretrained(model_or_id)
  • name = self.__class__.__name__
  • tokenizer = tokenizer or AutoTokenizer.from_pretrained(model_or_id)

compute(responses, prompts=None)

Compute perplexity for each response (and the mean across the batch).

Parameters:

  • responses (list[str], required): Text sequences to score.
  • prompts (list[str] | None, default None): Unused here; present for a uniform metric API.

Returns:

  dict[str, float]: A dict with keys:

  • "mean_perplexity": mean perplexity over all inputs.
  • "perplexities": list of per-sample perplexities in input order.

Source code in aisteer360/evaluation/metrics/generic/perplexity.py, lines 69-128
@torch.no_grad()
def compute(
    self,
    responses: list[str],
    prompts: list[str] | None = None,
) -> dict[str, float]:
    """Compute perplexity for each response (and the mean across the batch).

    Args:
        responses (list[str]): Text sequences to score.
        prompts (list[str] | None, optional): Unused here; present for a uniform metric API.

    Returns:
        dict[str, float]: A dict with keys:

            - ``"mean_perplexity"``: mean perplexity over all inputs.
            - ``"perplexities"``: list of per-sample perplexities in input order.
    """
    perplexities: list[float] = []
    local_batch_size = self.batch_size

    for i in range(0, len(responses), local_batch_size):
        batch = responses[i : i + local_batch_size]

        encoding = self.tokenizer(
            batch,
            padding=True,
            truncation=self.max_length is not None,
            max_length=self.max_length,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(self.device)
        input_ids = encoding["input_ids"]

        if self.add_bos:
            bos_tokens = torch.full(
                (input_ids.size(0), 1),
                self.tokenizer.bos_token_id,
                device=self.device,
            )
            input_ids = torch.cat([bos_tokens, input_ids], dim=1)

        logits = self.model(input_ids).logits[:, :-1]
        labels = input_ids[:, 1:]

        loss_per_token = F.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            labels.reshape(-1),
            reduction="none",
        ).view(labels.size())

        mask = labels.ne(self.tokenizer.pad_token_id)
        seq_loss = (loss_per_token * mask).sum(1) / mask.sum(1)

        perplexities.extend(torch.exp(seq_loss).cpu().tolist())

    return {
        "mean_perplexity": sum(perplexities) / len(perplexities),
        "perplexities": perplexities,
    }
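
A minimal usage sketch (the model id "gpt2" is a placeholder; the constructor arguments are those documented above):

from aisteer360.evaluation.metrics.generic.perplexity import Perplexity

perplexity = Perplexity("gpt2", batch_size=8, max_length=1024)
results = perplexity.compute(
    responses=["The cat sat on the mat.", "Colorless green ideas sleep furiously."],
)
print(results["mean_perplexity"])  # mean over the batch; lower is better
print(results["perplexities"])     # one value per response, in input order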

relevance

Relevance

Bases: LLMJudgeMetric

Judge relevance of a response to a prompt.

Source code in aisteer360/evaluation/metrics/generic/relevance.py, lines 19-30
class Relevance(LLMJudgeMetric):
    """
    Judge relevance of a response to a prompt.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            prompt_template=_PROMPT,
            scale=(1, 5),
            **kwargs,
        )

Instance attributes:

  • base_prompt_template = prompt_template.strip()
  • batch_size = batch_size
  • device = device or ('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
  • extras = extras
  • format_instructions = self.output_parser.get_format_instructions()
  • max_retries = max_retries
  • model = AutoModelForCausalLM.from_pretrained(model_or_id)
  • name = self.__class__.__name__
  • num_return_sequences = int(gen_kwargs.pop('num_return_sequences', 1))
  • pipeline = TextGenerationPipeline(model=self.model, tokenizer=self.tokenizer)
  • scale = scale
  • tokenizer = tokenizer or AutoTokenizer.from_pretrained(model_or_id)
  • use_chat = hasattr(self.tokenizer, 'apply_chat_template') and self.tokenizer.chat_template is not None

compute(responses, prompts=None, **kwargs)

Compute LLM judge scores for a list of responses.

Evaluates each response using the configured judge model and prompt template. Scores are averaged when multiple samples are generated per response (via num_return_sequences).

Parameters:

  • responses (list[str], required): List of text responses to evaluate.
  • prompts (list[str] | None, default None): Optional list of prompts corresponding to each response. If provided, must be the same length as responses. These prompts can be referenced in the prompt_template using the {prompt} placeholder.
  • **kwargs (Any, default {}): Additional keyword arguments (currently unused).

Returns:

  dict[str, float | list[float]]: Score statistics containing:

  • "mean_score": Overall average score across all responses
  • "scores": List of mean scores for each response (averaged across samples)
  • "raw_scores": List of lists containing all individual scores for each response

Raises:

  AssertionError: If prompts is provided but has a different length than responses.

Source code in aisteer360/evaluation/metrics/base_judge.py, lines 206-286
@torch.inference_mode()
def compute(
    self,
    responses: list[str],
    prompts: list[str] | None = None,
    **kwargs: Any,
) -> dict[str, float | list[float]]:
    """Compute LLM judge scores for a list of responses.

    Evaluates each response using the configured judge model and prompt template. Scores are averaged when multiple
    samples are generated per response (via `num_return_sequences`).

    Args:
        responses (list[str]): List of text responses to evaluate.
        prompts (list[str] | None): Optional list of prompts corresponding to each response.
            If provided, must be the same length as responses. These prompts can be
            referenced in the prompt_template using the {prompt} placeholder.
        **kwargs: Additional keyword arguments (currently unused).

    Returns:
        Score statistics containing:

            - "mean_score": Overall average score across all responses
            - "scores": List of mean scores for each response (averaged across samples)
            - "raw_scores": List of lists containing all individual scores for each response

    Raises:
        AssertionError: If prompts is provided but has different length than responses.
    """

    if prompts is not None and len(prompts) != len(responses):
        raise AssertionError("`responses` and `prompts` must be the same length")

    # build prompts
    prompts_list: list[str] = []
    for i in range(len(responses)):
        fields: dict[str, str | float] = {
            "response": responses[i],
            "lower_bound": self.scale[0],
            "upper_bound": self.scale[1],
        }
        if prompts is not None:
            fields["prompt"] = prompts[i]

        prompt_core = self.base_prompt_template.format(**fields)
        prompt_formatted = self._wrap(prompt_core + "\n\n" + self.format_instructions)
        prompts_list.append(prompt_formatted)

    # generate
    prompt_scores: list[list[float]] = []
    for batch in self._batch_chunks(prompts_list, self.batch_size):
        outputs = self.pipeline(
            batch,
            num_return_sequences=self.num_return_sequences,
            return_full_text=False,
            clean_up_tokenization_spaces=True,
        )

        for prompt, generations in zip(batch, outputs):
            generations = generations if isinstance(generations, list) else [generations]
            assert len(generations) == self.num_return_sequences

            scores = []
            for generation in generations:
                reply_text = generation["generated_text"]
                try:
                    score = self.parse_fn(reply_text, self.scale)
                except Exception:
                    score = self._score_with_retries(prompt)
                scores.append(score)

            prompt_scores.append(scores)

    mean_per_prompt = [sum(prompt_score) / len(prompt_score) for prompt_score in prompt_scores]
    corpus_mean = sum(mean_per_prompt) / len(mean_per_prompt)

    return {
        "mean_score": corpus_mean,  # overall average
        "scores": mean_per_prompt,  # one number per original prompt
        "raw_scores": prompt_scores  # n_samples scores per prompt
    }
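
A minimal usage sketch mirroring the Factuality example above; the judge model id is a placeholder and the positional model_or_id argument is an assumption inferred from the attribute listing:

from aisteer360.evaluation.metrics.generic.relevance import Relevance

relevance = Relevance("Qwen/Qwen2.5-7B-Instruct")
results = relevance.compute(
    responses=["Paris is the capital of France.", "I like turtles."],
    prompts=["What is the capital of France?"] * 2,
)
print(results["mean_score"])  # average relevance on the 1-5 scale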