
InstructionFollowing

aisteer360.evaluation.use_cases.instruction_following

Use case class for the instruction following task.

use_case

InstructionFollowing

Bases: UseCase

Instruction following evaluation use case using the IFEval dataset.

Evaluates a model's ability to follow specific instructions by testing adherence to various formatting, content, and structural constraints. Uses the IFEval dataset, which contains prompts with explicit instructions that models must follow precisely.

The evaluation focuses on whether models can follow instructions like formatting requirements (e.g., "respond in exactly 3 sentences"), content constraints (e.g., "include the word 'fantastic' twice"), and structural requirements (e.g., "use bullet points", "write in JSON format").

Attributes:

    evaluation_data: List of instances containing prompts and instruction metadata.
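
For orientation, here is a minimal sketch of one evaluation instance in the shape this use case expects. The keys match those checked by validate_evaluation_data; the prompt, instruction ID, and kwargs values are illustrative, not drawn from the actual IFEval data.

# Illustrative instance only; keys mirror the required fields, values are made up.
example_instance = {
    "prompt": "Describe your favorite city. Your answer must contain exactly 3 sentences.",
    "instructions": ["Your answer must contain exactly 3 sentences."],
    "instruction_id_list": ["length_constraints:number_sentences"],  # hypothetical instruction ID
    "kwargs": [{"num_sentences": 3, "relation": "exactly"}],  # per-instruction metadata
}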

Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
class InstructionFollowing(UseCase):
    """Instruction following evaluation use case using the IFEval dataset.

    Evaluates a model's ability to follow specific instructions by testing adherence to various formatting, content,
    and structural constraints. Uses the IFEval dataset, which contains prompts with explicit instructions that models
    must follow precisely.

    The evaluation focuses on whether models can follow instructions like formatting requirements (e.g., "respond in
    exactly 3 sentences"), content constraints (e.g., "include the word 'fantastic' twice"), and structural
    requirements (e.g., "use bullet points", "write in JSON format").

    Attributes:
        evaluation_data: List of instances containing prompts and instruction metadata.
    """

    def validate_evaluation_data(self, evaluation_data: dict[str, Any]) -> None:
        """Validates that evaluation data contains required fields for instruction following evaluation.

        Ensures each data instance has the necessary keys for the evaluation.

        Args:
            evaluation_data: Dictionary containing a single evaluation instance with prompt, instructions, and metadata.

        Raises:
            ValueError: If required keys ('prompt', 'instructions', 'instruction_id_list', 'kwargs') are missing.
        """
        missing_keys = [key for key in _EVALUATION_REQ_KEYS if key not in evaluation_data]
        if missing_keys:
            raise ValueError(f"Missing required keys: {missing_keys}")

    def generate(
        self,
        model_or_pipeline,
        tokenizer,
        gen_kwargs: dict | None = None,
        runtime_overrides: dict[tuple[str, str], str] | None = None,
        **__
    ) -> list[dict[str, Any]]:
        """Generates model responses for instruction following prompts.

        Processes evaluation data to create chat-formatted prompts and generates model responses.

        Args:
            model_or_pipeline: Either a HuggingFace model or SteeringPipeline instance to use for generation.
            tokenizer: Tokenizer for encoding/decoding text.
            gen_kwargs: Optional generation parameters passed to the model's generate method.
            runtime_overrides: Optional runtime parameter overrides for steering controls, structured as {(pipeline_name, param_name): value}.

        Returns:
            List of generation dictionaries, each containing:

                - "response": Generated text response from the model
                - "prompt": Original instruction following prompt
                - "instructions": List of specific instructions the model should follow
                - "instruction_id_list": Identifiers for each instruction type
                - "kwargs": Additional metadata for instruction evaluation
        """
        if not self.evaluation_data:
            print("No evaluation data provided.")
            return []

        gen_kwargs = dict(gen_kwargs or {})
        prompt_data = []

        for instance in self.evaluation_data:
            user_prompt = [{"role": "user", "content": instance["prompt"]}]
            prompt_data.append({"prompt": user_prompt})

        responses = batch_retry_generate(
            prompt_data=prompt_data,
            model_or_pipeline=model_or_pipeline,
            tokenizer=tokenizer,
            gen_kwargs=gen_kwargs,
            runtime_overrides=runtime_overrides,
            evaluation_data=self.evaluation_data,
        )

        generations = [
            {
                "response": response,
                "prompt": eval_data["prompt"],
                "instructions": eval_data["instructions"],
                "instruction_id_list": eval_data["instruction_id_list"],
                "kwargs": eval_data["kwargs"],
            }
            for eval_data, response in zip(self.evaluation_data, responses)
        ]

        return generations

    def evaluate(self, generations: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
        """Evaluates generated responses against instruction requirements using configured metrics.

        Passes generation dictionaries to all evaluation metrics specified during initialization.

        Args:
            generations: List of generation dictionaries returned by the `generate()` method, each containing
                response, prompt, instructions, instruction_id_list, and kwargs fields.

        Returns:
            Dictionary of scores keyed by `metric_name`.
        """
        results = {}
        for metric in self.evaluation_metrics:
            results[metric.name] = metric(responses=generations)
        return results

    def export(self, profiles: dict[str, Any], save_dir: str) -> None:
        """Exports instruction following evaluation results to structured JSON files.

        Creates two output files:

            1. `responses.json`: Contains model responses for each steering method
            2. `scores.json`: Contains strict metric scores for each steering method

        Args:
            profiles: Dictionary containing evaluation results from all tested pipelines.
            save_dir: Directory path where results should be saved.
        """
        folder_path = Path(save_dir)
        folder_path.mkdir(parents=True, exist_ok=True)
        steering_methods, predictions, follow_instructions = [], {}, {}
        inputs = None

        for steering_method, results in profiles.items():
            generations = results.pop("generations")
            steering_methods.append(steering_method)
            predictions[steering_method] = [gen["response"] for gen in generations]

            # get instruction following details from the StrictInstruction metric
            if "StrictInstruction" in results["evaluations"]:
                follow_instructions[steering_method] = results["evaluations"][
                    "StrictInstruction"
                ].pop("follow_all_instructions")
            if not inputs:
                inputs = [gen["prompt"] for gen in generations]

        responses = []
        for idx, prompt in enumerate(inputs):
            response = {"prompt": prompt}
            for method in steering_methods:
                response[method] = predictions[method][idx]
                response[f"{method}_instr_follow"] = follow_instructions[method][idx]
            responses.append(response)

        with open(folder_path / "responses.json", "w") as f:
            json.dump(responses, f, indent=4)
        with open(folder_path / "scores.json", "w") as f:
            json.dump(profiles, f, indent=4)
evaluation_data instance-attribute

    evaluation_data = [(json.loads(line)) for line in f] if path.suffix == '.jsonl' else json.load(f)

evaluation_metrics instance-attribute

    evaluation_metrics = evaluation_metrics
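
As the expression above suggests, evaluation data is read either from a .jsonl file (one JSON object per line) or from a plain .json file. A minimal sketch of that loading logic in isolation (the file name is hypothetical):

import json
from pathlib import Path

path = Path("ifeval_instances.jsonl")  # hypothetical local file
with path.open() as f:
    # one JSON object per line for .jsonl, otherwise a single JSON document
    evaluation_data = [json.loads(line) for line in f] if path.suffix == ".jsonl" else json.load(f)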
evaluate(generations)

Evaluates generated responses against instruction requirements using configured metrics.

Passes generation dictionaries to all evaluation metrics specified during initialization.

Parameters:

    generations (list[dict[str, Any]], required): List of generation dictionaries returned by the generate() method, each containing response, prompt, instructions, instruction_id_list, and kwargs fields.

Returns:

    dict[str, dict[str, Any]]: Dictionary of scores keyed by metric_name.
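
A short usage sketch, assuming use_case is an InstructionFollowing instance and generations was produced by generate(); the metric names in the output depend entirely on the metrics configured at initialization.

# Scores are keyed by metric name, e.g. "StrictInstruction" if that metric was configured.
scores = use_case.evaluate(generations)
for metric_name, metric_scores in scores.items():
    print(metric_name, metric_scores)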

Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
def evaluate(self, generations: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
    """Evaluates generated responses against instruction requirements using configured metrics.

    Passes generation dictionaries to all evaluation metrics specified during initialization.

    Args:
        generations: List of generation dictionaries returned by the `generate()` method, each containing
            response, prompt, instructions, instruction_id_list, and kwargs fields.

    Returns:
        Dictionary of scores keyed by `metric_name`.
    """
    results = {}
    for metric in self.evaluation_metrics:
        results[metric.name] = metric(responses=generations)
    return results
export(profiles, save_dir)

Exports instruction following evaluation results to structured JSON files.

Creates two output files:

1. `responses.json`: Contains model responses for each steering method
2. `scores.json`: Contains strict metric scores for each steering method

Parameters:

    profiles (dict[str, Any], required): Dictionary containing evaluation results from all tested pipelines.

    save_dir (str, required): Directory path where results should be saved.
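
An illustrative sketch of exporting results; the profiles dictionary comes from the surrounding evaluation harness, and the steering-method name "baseline" below is a placeholder.

# Writes responses.json and scores.json into the given directory.
use_case.export(profiles, save_dir="results/instruction_following")

# Each entry in responses.json then looks roughly like:
# {
#     "prompt": "<original instruction-following prompt>",
#     "baseline": "<generated response for the 'baseline' method>",
#     "baseline_instr_follow": true
# }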
Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
def export(self, profiles: dict[str, Any], save_dir: str) -> None:
    """Exports instruction following evaluation results to structured JSON files.

    Creates two output files:

        1. `responses.json`: Contains model responses for each steering method
        2. `scores.json`: Contains strict metric scores for each steering method

    Args:
        profiles: Dictionary containing evaluation results from all tested pipelines.
        save_dir: Directory path where results should be saved.
    """
    folder_path = Path(save_dir)
    folder_path.mkdir(parents=True, exist_ok=True)
    steering_methods, predictions, follow_instructions = [], {}, {}
    inputs = None

    for steering_method, results in profiles.items():
        generations = results.pop("generations")
        steering_methods.append(steering_method)
        predictions[steering_method] = [gen["response"] for gen in generations]

        # get instruction following details from the StrictInstruction metric
        if "StrictInstruction" in results["evaluations"]:
            follow_instructions[steering_method] = results["evaluations"][
                "StrictInstruction"
            ].pop("follow_all_instructions")
        if not inputs:
            inputs = [gen["prompt"] for gen in generations]

    responses = []
    for idx, prompt in enumerate(inputs):
        response = {"prompt": prompt}
        for method in steering_methods:
            response[method] = predictions[method][idx]
            response[f"{method}_instr_follow"] = follow_instructions[method][idx]
        responses.append(response)

    with open(folder_path / "responses.json", "w") as f:
        json.dump(responses, f, indent=4)
    with open(folder_path / "scores.json", "w") as f:
        json.dump(profiles, f, indent=4)
generate(model_or_pipeline, tokenizer, gen_kwargs=None, runtime_overrides=None, **__)

Generates model responses for instruction following prompts.

Processes evaluation data to create chat-formatted prompts and generates model responses.

Parameters:

    model_or_pipeline (required): Either a HuggingFace model or SteeringPipeline instance to use for generation.

    tokenizer (required): Tokenizer for encoding/decoding text.

    gen_kwargs (dict | None, default: None): Optional generation parameters passed to the model's generate method.

    runtime_overrides (dict[tuple[str, str], str] | None, default: None): Optional runtime parameter overrides for steering controls, structured as {(pipeline_name, param_name): value}.

Returns:

    list[dict[str, Any]]: List of generation dictionaries, each containing:

        - "response": Generated text response from the model
        - "prompt": Original instruction following prompt
        - "instructions": List of specific instructions the model should follow
        - "instruction_id_list": Identifiers for each instruction type
        - "kwargs": Additional metadata for instruction evaluation
Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
def generate(
    self,
    model_or_pipeline,
    tokenizer,
    gen_kwargs: dict | None = None,
    runtime_overrides: dict[tuple[str, str], str] | None = None,
    **__
) -> list[dict[str, Any]]:
    """Generates model responses for instruction following prompts.

    Processes evaluation data to create chat-formatted prompts and generates model responses.

    Args:
        model_or_pipeline: Either a HuggingFace model or SteeringPipeline instance to use for generation.
        tokenizer: Tokenizer for encoding/decoding text.
        gen_kwargs: Optional generation parameters passed to the model's generate method.
        runtime_overrides: Optional runtime parameter overrides for steering controls, structured as {(pipeline_name, param_name): value}.

    Returns:
        List of generation dictionaries, each containing:

            - "response": Generated text response from the model
            - "prompt": Original instruction following prompt
            - "instructions": List of specific instructions the model should follow
            - "instruction_id_list": Identifiers for each instruction type
            - "kwargs": Additional metadata for instruction evaluation
    """
    if not self.evaluation_data:
        print("No evaluation data provided.")
        return []

    gen_kwargs = dict(gen_kwargs or {})
    prompt_data = []

    for instance in self.evaluation_data:
        user_prompt = [{"role": "user", "content": instance["prompt"]}]
        prompt_data.append({"prompt": user_prompt})

    responses = batch_retry_generate(
        prompt_data=prompt_data,
        model_or_pipeline=model_or_pipeline,
        tokenizer=tokenizer,
        gen_kwargs=gen_kwargs,
        runtime_overrides=runtime_overrides,
        evaluation_data=self.evaluation_data,
    )

    generations = [
        {
            "response": response,
            "prompt": eval_data["prompt"],
            "instructions": eval_data["instructions"],
            "instruction_id_list": eval_data["instruction_id_list"],
            "kwargs": eval_data["kwargs"],
        }
        for eval_data, response in zip(self.evaluation_data, responses)
    ]

    return generations
validate_evaluation_data(evaluation_data)

Validates that evaluation data contains required fields for instruction following evaluation.

Ensures each data instance has the necessary keys for the evaluation.

Parameters:

    evaluation_data (dict[str, Any], required): Dictionary containing a single evaluation instance with prompt, instructions, and metadata.

Raises:

    ValueError: If required keys ('prompt', 'instructions', 'instruction_id_list', 'kwargs') are missing.
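
A minimal sketch of the check (instance contents are illustrative):

complete = {
    "prompt": "List three colors. Use bullet points.",
    "instructions": ["Use bullet points."],
    "instruction_id_list": ["detectable_format:number_bullet_lists"],  # hypothetical ID
    "kwargs": [{"num_bullets": 3}],
}
use_case.validate_evaluation_data(complete)  # passes silently

incomplete = {"prompt": "List three colors."}
use_case.validate_evaluation_data(incomplete)  # raises ValueError listing the missing keys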

Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
def validate_evaluation_data(self, evaluation_data: dict[str, Any]) -> None:
    """Validates that evaluation data contains required fields for instruction following evaluation.

    Ensures each data instance has the necessary keys for the evaluation.

    Args:
        evaluation_data: Dictionary containing a single evaluation instance with prompt, instructions, and metadata.

    Raises:
        ValueError: If required keys ('prompt', 'instructions', 'instruction_id_list', 'kwargs') are missing.
    """
    missing_keys = [key for key in _EVALUATION_REQ_KEYS if key not in evaluation_data]
    if missing_keys:
        raise ValueError(f"Missing required keys: {missing_keys}")