
InstructionFollowing

aisteer360.evaluation.use_cases.instruction_following

Use case class for the instruction following task.

use_case

InstructionFollowing

Bases: UseCase

Instruction following evaluation use case using the IFEval dataset.

Evaluates a model's ability to follow specific instructions by testing adherence to various formatting, content, and structural constraints. Uses the IFEval dataset, which contains prompts with explicit instructions that models must follow precisely.

The evaluation focuses on whether models can follow instructions like formatting requirements (e.g., "respond in exactly 3 sentences"), content constraints (e.g., "include the word 'fantastic' twice"), and structural requirements (e.g., "use bullet points", "write in JSON format").

Attributes:

    evaluation_data: List of instances containing prompts and instruction metadata.
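
For orientation, here is a minimal sketch of one evaluation instance in the shape this use case expects. The keys match those checked by validate_evaluation_data; the prompt, instruction ID, and kwargs values are illustrative, not drawn from the actual IFEval data.

# Illustrative instance only; keys mirror the required fields, values are made up.
example_instance = {
    "prompt": "Describe your favorite city. Your answer must contain exactly 3 sentences.",
    "instructions": ["Your answer must contain exactly 3 sentences."],
    "instruction_id_list": ["length_constraints:number_sentences"],  # hypothetical instruction ID
    "kwargs": [{"num_sentences": 3, "relation": "exactly"}],  # per-instruction metadata
}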

Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
class InstructionFollowing(UseCase):
    """Instruction following evaluation use case using the IFEval dataset.

    Evaluates a model's ability to follow specific instructions by testing adherence to various formatting, content,
    and structural constraints. Uses the IFEval dataset, which contains prompts with explicit instructions that models
    must follow precisely.

    The evaluation focuses on whether models can follow instructions like formatting requirements (e.g., "respond in
    exactly 3 sentences"), content constraints (e.g., "include the word 'fantastic' twice"), and structural
    requirements (e.g., "use bullet points", "write in JSON format").

    Attributes:
        evaluation_data: List of instances containing prompts and instruction metadata.
    """

    def validate_evaluation_data(self, evaluation_data: dict[str, Any]) -> None:
        """Validates that evaluation data contains required fields for instruction following evaluation.

        Ensures each data instance has the necessary keys for the evaluation.

        Args:
            evaluation_data: Dictionary containing a single evaluation instance with prompt, instructions, and metadata.

        Raises:
            ValueError: If required keys ('prompt', 'instructions', 'instruction_id_list', 'kwargs') are missing.
        """
        missing_keys = [key for key in _EVALUATION_REQ_KEYS if key not in evaluation_data]
        if missing_keys:
            raise ValueError(f"Missing required keys: {missing_keys}")

    def generate(
        self,
        model_or_pipeline,
        tokenizer,
        gen_kwargs: dict | None = None,
        runtime_overrides: dict[tuple[str, str], str] | None = None,
        **__
    ) -> list[dict[str, Any]]:
        """Generates model responses for instruction following prompts.

        Processes evaluation data to create chat-formatted prompts and generates model responses.

        Args:
            model_or_pipeline: Either a HuggingFace model or SteeringPipeline instance to use for generation.
            tokenizer: Tokenizer for encoding/decoding text.
            gen_kwargs: Optional generation parameters passed to the model's generate method.
            runtime_overrides: Optional runtime parameter overrides for steering controls, structured as {(pipeline_name, param_name): value}.

        Returns:
            List of generation dictionaries, each containing:

                - "response": Generated text response from the model
                - "prompt": Original instruction following prompt
                - "instructions": List of specific instructions the model should follow
                - "instruction_id_list": Identifiers for each instruction type
                - "kwargs": Additional metadata for instruction evaluation
        """
        if not self.evaluation_data:
            print("No evaluation data provided.")
            return []

        gen_kwargs = dict(gen_kwargs or {})
        prompt_data = []

        for instance in self.evaluation_data:
            user_prompt = [{"role": "user", "content": instance["prompt"]}]
            prompt_data.append({"prompt": user_prompt})

        responses = batch_retry_generate(
            prompt_data=prompt_data,
            model_or_pipeline=model_or_pipeline,
            tokenizer=tokenizer,
            gen_kwargs=gen_kwargs,
            runtime_overrides=runtime_overrides,
            evaluation_data=self.evaluation_data,
        )

        generations = [
            {
                "response": response,
                "prompt": eval_data["prompt"],
                "instructions": eval_data["instructions"],
                "instruction_id_list": eval_data["instruction_id_list"],
                "kwargs": eval_data["kwargs"],
            }
            for eval_data, response in zip(self.evaluation_data, responses)
        ]

        return generations

    def evaluate(self, generations: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
        """Evaluates generated responses against instruction requirements using configured metrics.

        Passes generation dictionaries to all evaluation metrics specified during initialization.

        Args:
            generations: List of generation dictionaries returned by the `generate()` method, each containing
                response, prompt, instructions, instruction_id_list, and kwargs fields.

        Returns:
            Dictionary of scores keyed by `metric_name`.
        """
        results = {}
        for metric in self.evaluation_metrics:
            results[metric.name] = metric(responses=generations)
        return results

    def export(self, profiles: dict[str, Any], save_dir: str) -> None:
        """Exports instruction following evaluation results to structured JSON files.

        Creates two output files:

            1. `responses.json`: Contains model responses for each steering method
            2. `scores.json`: Contains strict metric scores for each steering method

        Args:
            profiles: Dictionary containing evaluation results from all tested pipelines.
            save_dir: Directory path where results should be saved.
        """
        folder_path = Path(save_dir)
        folder_path.mkdir(parents=True, exist_ok=True)
        steering_methods, predictions, follow_instructions = [], {}, {}
        inputs = None

        for steering_method, results in profiles.items():
            generations = results.pop("generations")
            steering_methods.append(steering_method)
            predictions[steering_method] = [gen["response"] for gen in generations]

            # get instruction following details from the StrictInstruction metric
            if "StrictInstruction" in results["evaluations"]:
                follow_instructions[steering_method] = results["evaluations"][
                    "StrictInstruction"
                ].pop("follow_all_instructions")
            if not inputs:
                inputs = [gen["prompt"] for gen in generations]

        responses = []
        for idx, prompt in enumerate(inputs):
            response = {"prompt": prompt}
            for method in steering_methods:
                response[method] = predictions[method][idx]
                response[f"{method}_instr_follow"] = follow_instructions[method][idx]
            responses.append(response)

        with open(folder_path / "responses.json", "w") as f:
            json.dump(responses, f, indent=4)
        with open(folder_path / "scores.json", "w") as f:
            json.dump(profiles, f, indent=4)
evaluation_data instance-attribute

    evaluation_data = [(json.loads(line)) for line in f] if path.suffix == '.jsonl' else json.load(f)

evaluation_metrics instance-attribute

    evaluation_metrics = evaluation_metrics
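
As the expression above suggests, evaluation data is read either from a .jsonl file (one JSON object per line) or from a plain .json file. A minimal sketch of that loading logic in isolation (the file name is hypothetical):

import json
from pathlib import Path

path = Path("ifeval_instances.jsonl")  # hypothetical local file
with path.open() as f:
    # one JSON object per line for .jsonl, otherwise a single JSON document
    evaluation_data = [json.loads(line) for line in f] if path.suffix == ".jsonl" else json.load(f)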
evaluate(generations)

Evaluates generated responses against instruction requirements using configured metrics.

Passes generation dictionaries to all evaluation metrics specified during initialization.

Parameters:

    generations (list[dict[str, Any]], required): List of generation dictionaries returned by the generate() method, each containing response, prompt, instructions, instruction_id_list, and kwargs fields.

Returns:

    dict[str, dict[str, Any]]: Dictionary of scores keyed by metric_name.
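
A short usage sketch, assuming use_case is an InstructionFollowing instance and generations was produced by generate(); the metric names in the output depend entirely on the metrics configured at initialization.

# Scores are keyed by metric name, e.g. "StrictInstruction" if that metric was configured.
scores = use_case.evaluate(generations)
for metric_name, metric_scores in scores.items():
    print(metric_name, metric_scores)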

Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
def evaluate(self, generations: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
    """Evaluates generated responses against instruction requirements using configured metrics.

    Passes generation dictionaries to all evaluation metrics specified during initialization.

    Args:
        generations: List of generation dictionaries returned by the `generate()` method, each containing
            response, prompt, instructions, instruction_id_list, and kwargs fields.

    Returns:
        Dictionary of scores keyed by `metric_name`.
    """
    results = {}
    for metric in self.evaluation_metrics:
        results[metric.name] = metric(responses=generations)
    return results
export(profiles, save_dir)

Exports instruction following evaluation results to structured JSON files.

Creates two output files:

1. `responses.json`: Contains model responses for each steering method
2. `scores.json`: Contains strict metric scores for each steering method

Parameters:

    profiles (dict[str, Any], required): Dictionary containing evaluation results from all tested pipelines.

    save_dir (str, required): Directory path where results should be saved.
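
An illustrative sketch of exporting results; the profiles dictionary comes from the surrounding evaluation harness, and the steering-method name "baseline" below is a placeholder.

# Writes responses.json and scores.json into the given directory.
use_case.export(profiles, save_dir="results/instruction_following")

# Each entry in responses.json then looks roughly like:
# {
#     "prompt": "<original instruction-following prompt>",
#     "baseline": "<generated response for the 'baseline' method>",
#     "baseline_instr_follow": true
# }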
Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
def export(self, profiles: dict[str, Any], save_dir: str) -> None:
    """Exports instruction following evaluation results to structured JSON files.

    Creates two output files:

        1. `responses.json`: Contains model responses for each steering method
        2. `scores.json`: Contains strict metric scores for each steering method

    Args:
        profiles: Dictionary containing evaluation results from all tested pipelines.
        save_dir: Directory path where results should be saved.
    """
    folder_path = Path(save_dir)
    folder_path.mkdir(parents=True, exist_ok=True)
    steering_methods, predictions, follow_instructions = [], {}, {}
    inputs = None

    for steering_method, results in profiles.items():
        generations = results.pop("generations")
        steering_methods.append(steering_method)
        predictions[steering_method] = [gen["response"] for gen in generations]

        # get instruction following details from the StrictInstruction metric
        if "StrictInstruction" in results["evaluations"]:
            follow_instructions[steering_method] = results["evaluations"][
                "StrictInstruction"
            ].pop("follow_all_instructions")
        if not inputs:
            inputs = [gen["prompt"] for gen in generations]

    responses = []
    for idx, prompt in enumerate(inputs):
        response = {"prompt": prompt}
        for method in steering_methods:
            response[method] = predictions[method][idx]
            response[f"{method}_instr_follow"] = follow_instructions[method][idx]
        responses.append(response)

    with open(folder_path / "responses.json", "w") as f:
        json.dump(responses, f, indent=4)
    with open(folder_path / "scores.json", "w") as f:
        json.dump(profiles, f, indent=4)
generate(model_or_pipeline, tokenizer, gen_kwargs=None, runtime_overrides=None, **__)

Generates model responses for instruction following prompts.

Processes evaluation data to create chat-formatted prompts and generates model responses.

Parameters:

    model_or_pipeline (required): Either a HuggingFace model or SteeringPipeline instance to use for generation.

    tokenizer (required): Tokenizer for encoding/decoding text.

    gen_kwargs (dict | None, default: None): Optional generation parameters passed to the model's generate method.

    runtime_overrides (dict[tuple[str, str], str] | None, default: None): Optional runtime parameter overrides for steering controls, structured as {(pipeline_name, param_name): value}.

Returns:

    list[dict[str, Any]]: List of generation dictionaries, each containing:

        - "response": Generated text response from the model
        - "prompt": Original instruction following prompt
        - "instructions": List of specific instructions the model should follow
        - "instruction_id_list": Identifiers for each instruction type
        - "kwargs": Additional metadata for instruction evaluation
Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
def generate(
    self,
    model_or_pipeline,
    tokenizer,
    gen_kwargs: dict | None = None,
    runtime_overrides: dict[tuple[str, str], str] | None = None,
    **__
) -> list[dict[str, Any]]:
    """Generates model responses for instruction following prompts.

    Processes evaluation data to create chat-formatted prompts and generates model responses.

    Args:
        model_or_pipeline: Either a HuggingFace model or SteeringPipeline instance to use for generation.
        tokenizer: Tokenizer for encoding/decoding text.
        gen_kwargs: Optional generation parameters passed to the model's generate method.
        runtime_overrides: Optional runtime parameter overrides for steering controls, structured as {(pipeline_name, param_name): value}.

    Returns:
        List of generation dictionaries, each containing:

            - "response": Generated text response from the model
            - "prompt": Original instruction following prompt
            - "instructions": List of specific instructions the model should follow
            - "instruction_id_list": Identifiers for each instruction type
            - "kwargs": Additional metadata for instruction evaluation
    """
    if not self.evaluation_data:
        print("No evaluation data provided.")
        return []

    gen_kwargs = dict(gen_kwargs or {})
    prompt_data = []

    for instance in self.evaluation_data:
        user_prompt = [{"role": "user", "content": instance["prompt"]}]
        prompt_data.append({"prompt": user_prompt})

    responses = batch_retry_generate(
        prompt_data=prompt_data,
        model_or_pipeline=model_or_pipeline,
        tokenizer=tokenizer,
        gen_kwargs=gen_kwargs,
        runtime_overrides=runtime_overrides,
        evaluation_data=self.evaluation_data,
    )

    generations = [
        {
            "response": response,
            "prompt": eval_data["prompt"],
            "instructions": eval_data["instructions"],
            "instruction_id_list": eval_data["instruction_id_list"],
            "kwargs": eval_data["kwargs"],
        }
        for eval_data, response in zip(self.evaluation_data, responses)
    ]

    return generations
validate_evaluation_data(evaluation_data)

Validates that evaluation data contains required fields for instruction following evaluation.

Ensures each data instance has the necessary keys for the evaluation.

Parameters:

    evaluation_data (dict[str, Any], required): Dictionary containing a single evaluation instance with prompt, instructions, and metadata.

Raises:

    ValueError: If required keys ('prompt', 'instructions', 'instruction_id_list', 'kwargs') are missing.
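
A minimal sketch of the check (instance contents are illustrative):

complete = {
    "prompt": "List three colors. Use bullet points.",
    "instructions": ["Use bullet points."],
    "instruction_id_list": ["detectable_format:number_bullet_lists"],  # hypothetical ID
    "kwargs": [{"num_bullets": 3}],
}
use_case.validate_evaluation_data(complete)  # passes silently

incomplete = {"prompt": "List three colors."}
use_case.validate_evaluation_data(incomplete)  # raises ValueError listing the missing keys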

Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
def validate_evaluation_data(self, evaluation_data: dict[str, Any]) -> None:
    """Validates that evaluation data contains required fields for instruction following evaluation.

    Ensures each data instance has the necessary keys for the evaluation.

    Args:
        evaluation_data: Dictionary containing a single evaluation instance with prompt, instructions, and metadata.

    Raises:
        ValueError: If required keys ('prompt', 'instructions', 'instruction_id_list', 'kwargs') are missing.
    """
    missing_keys = [key for key in _EVALUATION_REQ_KEYS if key not in evaluation_data]
    if missing_keys:
        raise ValueError(f"Missing required keys: {missing_keys}")