
InstructionFollowing

aisteer360.evaluation.use_cases.instruction_following

Use case class for the instruction following task.

use_case

InstructionFollowing

Bases: UseCase

Instruction following use case using the IFEval dataset.

Evaluates a model's ability to follow specific instructions by testing adherence to various formatting, content, and structural constraints. Uses the IFEval dataset, which contains prompts with explicit instructions that models must follow precisely.

The evaluation focuses on whether models can follow instructions like:

  • Formatting requirements (e.g., "respond in exactly 3 sentences")
  • Content constraints (e.g., "include the word 'fantastic' twice")
  • Structural requirements (e.g., "use bullet points", "write in JSON format")

The evaluation data is expected to include fields such as 'prompt', 'instructions', 'instruction_id_list', and 'kwargs' so that instruction following can be assessed comprehensively.
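
For concreteness, a single evaluation record might look like the following. This is an illustrative sketch in the spirit of IFEval, not a record taken from the dataset; exact instruction identifiers and kwargs vary by record.

    # Illustrative IFEval-style record (not taken verbatim from the dataset).
    example_record = {
        "prompt": "Write a short product announcement. Your answer must contain exactly 3 bullet points.",
        "instructions": ["Your answer must contain exactly 3 bullet points."],
        "instruction_id_list": ["detectable_format:number_bullet_lists"],
        "kwargs": [{"num_bullets": 3}],
    }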

Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
class InstructionFollowing(UseCase):
    """
    Instruction following use case using the IFEval dataset.

    Evaluates model ability to follow specific instructions by testing adherence to
    various formatting, content, and structural constraints. Uses the IFEval dataset
    which contains prompts with explicit instructions that models must follow precisely.

    The evaluation focuses on whether models can follow instructions like:

    - Formatting requirements (e.g., "respond in exactly 3 sentences")
    - Content constraints (e.g., "include the word 'fantastic' twice")
    - Structural requirements (e.g., "use bullet points", "write in JSON format")

    Expected evaluation data format should include fields like 'prompt', 'instructions',
    'instruction_id_list', and 'kwargs' for comprehensive instruction following assessment.
    """

    def validate_evaluation_data(self, evaluation_data: dict[str, Any]) -> None:
        pass

    def generate(
        self,
        model_or_pipeline,
        tokenizer,
        gen_kwargs: dict | None = None,
        runtime_overrides: dict[tuple[str, str], str] | None = None,
    ) -> list[dict[str, Any]]:
        """Generates model responses for instruction following prompts.

        Processes evaluation data to create chat-formatted prompts and generates model responses.

        Args:
            model_or_pipeline: Either a HuggingFace model or SteeringPipeline instance to use for generation.
            tokenizer: Tokenizer for encoding/decoding text.
            gen_kwargs: Optional generation parameters passed to the model's generate method.
            runtime_overrides: Optional runtime parameter overrides for steering controls, structured as {(pipeline_name, param_name): value}.

        Returns:
            List of generation dictionaries, each containing:

                - "response": Generated text response from the model
                - "prompt": Original instruction following prompt
                - "instructions": List of specific instructions the model should follow
                - "instruction_id_list": Identifiers for each instruction type
                - "kwargs": Additional metadata for instruction evaluation
        """
        if not self.evaluation_data:
            print("No evaluation data provided.")
            return []

        gen_kwargs = dict(gen_kwargs or {})
        prompt_data = []

        for instance in self.evaluation_data:
            user_prompt = [{"role": "user", "content": instance["prompt"]}]
            prompt_data.append({"prompt": user_prompt})

        responses = batch_retry_generate(
            prompt_data=prompt_data,
            model_or_pipeline=model_or_pipeline,
            tokenizer=tokenizer,
            gen_kwargs=gen_kwargs,
            runtime_overrides=runtime_overrides,
            evaluation_data=self.evaluation_data,
        )

        generations = [
            {
                "response": response,
                "prompt": eval_data["prompt"],
                "instructions": eval_data["instructions"],
                "instruction_id_list": eval_data["instruction_id_list"],
                "kwargs": eval_data["kwargs"],
            }
            for eval_data, response in zip(self.evaluation_data, responses)
        ]

        return generations

    def evaluate(self, generations: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
        results = {}
        for metric in self.evaluation_metrics:
            results[metric.name] = metric(responses=generations)
        return results

    def export(
        self,
        profiles: dict[str, Any],
        save_dir: str,
    ) -> None:
        """Exports instruction following evaluation results to structured JSON files.

        Creates two output files:

        1. `responses.json`: Contains model responses for each steering method
        2. `scores.json`: Contains strict metric scores for each steering method

        Args:
            profiles: Dictionary containing evaluation results from all tested pipelines.
            save_dir: Directory path where results should be saved.
        """

        folder_path = Path(save_dir)
        folder_path.mkdir(parents=True, exist_ok=True)
        steering_methods, predictions, follow_instructions = [], {}, {}
        inputs = None

        for steering_method, results in profiles.items():
            generations = results.pop("generations")
            steering_methods.append(steering_method)
            predictions[steering_method] = [gen["response"] for gen in generations]

            # get instruction following details from the StrictInstruction metric
            if "StrictInstruction" in results["evaluations"]:
                follow_instructions[steering_method] = results["evaluations"][
                    "StrictInstruction"
                ].pop("follow_all_instructions")
            if not inputs:
                inputs = [gen["prompt"] for gen in generations]

        responses = []
        for idx, prompt in enumerate(inputs):
            response = {"prompt": prompt}
            for method in steering_methods:
                response[method] = predictions[method][idx]
                response[f"{method}_instr_follow"] = follow_instructions[method][idx]
            responses.append(response)

        with open(folder_path / "responses.json", "w") as f:
            json.dump(responses, f, indent=4)
        with open(folder_path / "scores.json", "w") as f:
            json.dump(profiles, f, indent=4)
evaluation_data instance-attribute

    evaluation_data = [(json.loads(line)) for line in f] if path.suffix == '.jsonl' else json.load(f)

    Evaluation dataset, loaded from a .jsonl file (one JSON record per line) or a .json file.

evaluation_metrics instance-attribute

    evaluation_metrics = evaluation_metrics

    Metrics used to score the model's generations.
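
A minimal end-to-end usage sketch is shown below. The constructor keyword names, the metric instance, and the model checkpoint are assumptions for illustration; only the profiles structure consumed by export() is taken from the code on this page.

    # Usage sketch. Constructor keywords and the metric instance are assumptions,
    # not a confirmed API; export() expects a metric named "StrictInstruction"
    # among the evaluation results (see the export() source below).
    from transformers import AutoModelForCausalLM, AutoTokenizer

    from aisteer360.evaluation.use_cases.instruction_following.use_case import InstructionFollowing

    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    strict_instruction = ...  # hypothetical StrictInstruction metric instance

    use_case = InstructionFollowing(
        evaluation_data="ifeval_subset.jsonl",   # hypothetical path; read as one JSON record per line
        evaluation_metrics=[strict_instruction],
    )

    generations = use_case.generate(model, tokenizer, gen_kwargs={"max_new_tokens": 256})
    evaluations = use_case.evaluate(generations)

    # profiles mirrors the structure read by export(): one entry per steering method
    use_case.export({"baseline": {"generations": generations, "evaluations": evaluations}}, save_dir="results")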
evaluate(generations)

Runs the required evaluation logic over the model's generations using the configured evaluation_metrics.

Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
def evaluate(self, generations: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
    results = {}
    for metric in self.evaluation_metrics:
        results[metric.name] = metric(responses=generations)
    return results
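
The returned dictionary is keyed by metric name. As a rough sketch of its shape (field names depend on the metric; the per-example 'follow_all_instructions' list is what export() reads for the StrictInstruction metric):

    # Illustrative output shape; actual fields depend on the configured metrics.
    {
        "StrictInstruction": {
            "follow_all_instructions": [True, False, True],  # one flag per evaluated prompt
            # ... other metric-specific aggregates
        },
    }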
export(profiles, save_dir)

Exports instruction following evaluation results to structured JSON files.

Creates two output files:

  1. responses.json: Contains model responses for each steering method
  2. scores.json: Contains strict metric scores for each steering method

Parameters:

  • profiles (dict[str, Any]): Dictionary containing evaluation results from all tested pipelines. Required.
  • save_dir (str): Directory path where results should be saved. Required.
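
Based on the loop in the source below, each entry of responses.json pairs the prompt with one response per steering method plus a per-method instruction-following flag. A sketch with placeholder method names:

    # Sketch of one responses.json entry (method names are placeholders).
    {
        "prompt": "Write a short product announcement. ...",
        "baseline": "<generated response>",
        "baseline_instr_follow": True,
        "steered": "<generated response>",
        "steered_instr_follow": False,
    }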
Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
def export(
    self,
    profiles: dict[str, Any],
    save_dir: str,
) -> None:
    """Exports instruction following evaluation results to structured JSON files.

    Creates two output files:

    1. `responses.json`: Contains model responses for each steering method
    2. `scores.json`: Contains strict metric scores for each steering method

    Args:
        profiles: Dictionary containing evaluation results from all tested pipelines.
        save_dir: Directory path where results should be saved.
    """

    folder_path = Path(save_dir)
    folder_path.mkdir(parents=True, exist_ok=True)
    steering_methods, predictions, follow_instructions = [], {}, {}
    inputs = None

    for steering_method, results in profiles.items():
        generations = results.pop("generations")
        steering_methods.append(steering_method)
        predictions[steering_method] = [gen["response"] for gen in generations]

        # get instruction following details from the StrictInstruction metric
        if "StrictInstruction" in results["evaluations"]:
            follow_instructions[steering_method] = results["evaluations"][
                "StrictInstruction"
            ].pop("follow_all_instructions")
        if not inputs:
            inputs = [gen["prompt"] for gen in generations]

    responses = []
    for idx, prompt in enumerate(inputs):
        response = {"prompt": prompt}
        for method in steering_methods:
            response[method] = predictions[method][idx]
            response[f"{method}_instr_follow"] = follow_instructions[method][idx]
        responses.append(response)

    with open(folder_path / "responses.json", "w") as f:
        json.dump(responses, f, indent=4)
    with open(folder_path / "scores.json", "w") as f:
        json.dump(profiles, f, indent=4)
generate(model_or_pipeline, tokenizer, gen_kwargs=None, runtime_overrides=None)

Generates model responses for instruction following prompts.

Processes evaluation data to create chat-formatted prompts and generates model responses.

Parameters:

  • model_or_pipeline: Either a HuggingFace model or SteeringPipeline instance to use for generation. Required.
  • tokenizer: Tokenizer for encoding/decoding text. Required.
  • gen_kwargs (dict | None): Optional generation parameters passed to the model's generate method. Default: None.
  • runtime_overrides (dict[tuple[str, str], str] | None): Optional runtime parameter overrides for steering controls, structured as {(pipeline_name, param_name): value}. Default: None.

Returns:

  list[dict[str, Any]]: List of generation dictionaries, each containing:

  • "response": Generated text response from the model
  • "prompt": Original instruction following prompt
  • "instructions": List of specific instructions the model should follow
  • "instruction_id_list": Identifiers for each instruction type
  • "kwargs": Additional metadata for instruction evaluation
Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
def generate(
    self,
    model_or_pipeline,
    tokenizer,
    gen_kwargs: dict | None = None,
    runtime_overrides: dict[tuple[str, str], str] | None = None,
) -> list[dict[str, Any]]:
    """Generates model responses for instruction following prompts.

    Processes evaluation data to create chat-formatted prompts and generates model responses.

    Args:
        model_or_pipeline: Either a HuggingFace model or SteeringPipeline instance to use for generation.
        tokenizer: Tokenizer for encoding/decoding text.
        gen_kwargs: Optional generation parameters passed to the model's generate method.
        runtime_overrides: Optional runtime parameter overrides for steering controls, structured as {(pipeline_name, param_name): value}.

    Returns:
        List of generation dictionaries, each containing:

            - "response": Generated text response from the model
            - "prompt": Original instruction following prompt
            - "instructions": List of specific instructions the model should follow
            - "instruction_id_list": Identifiers for each instruction type
            - "kwargs": Additional metadata for instruction evaluation
    """
    if not self.evaluation_data:
        print("No evaluation data provided.")
        return []

    gen_kwargs = dict(gen_kwargs or {})
    prompt_data = []

    for instance in self.evaluation_data:
        user_prompt = [{"role": "user", "content": instance["prompt"]}]
        prompt_data.append({"prompt": user_prompt})

    responses = batch_retry_generate(
        prompt_data=prompt_data,
        model_or_pipeline=model_or_pipeline,
        tokenizer=tokenizer,
        gen_kwargs=gen_kwargs,
        runtime_overrides=runtime_overrides,
        evaluation_data=self.evaluation_data,
    )

    generations = [
        {
            "response": response,
            "prompt": eval_data["prompt"],
            "instructions": eval_data["instructions"],
            "instruction_id_list": eval_data["instruction_id_list"],
            "kwargs": eval_data["kwargs"],
        }
        for eval_data, response in zip(self.evaluation_data, responses)
    ]

    return generations
validate_evaluation_data(evaluation_data)

Optional validation of the evaluation dataset. This use case performs no checks; the method body is a no-op.

Source code in aisteer360/evaluation/use_cases/instruction_following/use_case.py
def validate_evaluation_data(self, evaluation_data: dict[str, Any]) -> None:
    pass
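
If stricter checks were desired, an override could verify the fields that generate() and evaluate() rely on. The following is an illustrative sketch only, not part of the library:

    # Illustrative override sketch; not part of the library.
    from typing import Any

    def validate_evaluation_data(self, evaluation_data: list[dict[str, Any]]) -> None:
        required = {"prompt", "instructions", "instruction_id_list", "kwargs"}
        for i, record in enumerate(evaluation_data):
            missing = required - record.keys()
            if missing:
                raise ValueError(f"evaluation_data[{i}] is missing fields: {sorted(missing)}")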