Adding a structural control method
Required override: steer
Structural control methods modify the model's weights or underlying architecture, creating a new model. This tutorial
implements a `NoiseInjection` method that perturbs a model's weights with (scaled) Gaussian noise.
The registry entry follows the standard pattern:
from .control import NoiseInjection
from .args import NoiseInjectionArgs
# Registry entry for this control: its category and name, plus the control class
# and the args dataclass imported above.
REGISTRY_ENTRY = {
    "category": "structural_control",
    "name": "noise_injection",
    "control": NoiseInjection,
    "args": NoiseInjectionArgs,
}
Next, the args dataclass contains three parameters: `noise_scale`, the standard deviation of the Gaussian noise
to inject; `target_modules`, the list of layer-name patterns to modify (or `None` for all linear layers); and `seed`,
which makes the noise generation reproducible. Note that (as indicated in the general instructions for the arguments
dataclass) a list-valued default for a field such as `target_modules` must be supplied via `default_factory=list`
rather than simply `default`.
from dataclasses import dataclass, field
from aisteer360.algorithms.core.base_args import BaseArgs
@dataclass
class NoiseInjectionArgs(BaseArgs):
    """Arguments for the noise-injection structural control.

    Attributes:
        noise_scale: Standard deviation of the Gaussian noise to inject; must lie in [0, 1].
        target_modules: Module-name patterns selecting which layers to perturb, or
            ``None`` to perturb all linear layers.
        seed: Random seed for reproducible noise generation.

    Raises:
        ValueError: If ``noise_scale`` is outside [0, 1] or ``target_modules`` is an empty list.
        TypeError: If ``target_modules`` is neither ``None`` nor a list of strings.
    """

    noise_scale: float = field(
        default=0.01,
        metadata={"help": "Standard deviation of Gaussian noise to inject, in [0, 1]."},
    )
    # The default is None ("all linear layers"). An empty-list default (e.g. via
    # default_factory=list) would be rejected by the validation below, making the
    # no-argument construction fail. None is immutable, so a plain `default` is safe;
    # only a list-valued default would need default_factory.
    target_modules: list[str] | None = field(
        default=None,
        metadata={"help": "List of module name patterns to target. None means all linear layers."},
    )
    seed: int = field(
        default=42,
        metadata={"help": "Random seed for reproducible noise generation."},
    )

    # validation
    def __post_init__(self):
        """Validate field values once the dataclass has been initialised."""
        if not (0.0 <= self.noise_scale <= 1.0):
            raise ValueError("`noise_scale` must be in the interval [0, 1].")

        if self.target_modules is None:
            return

        if not isinstance(self.target_modules, list):
            raise TypeError("`target_modules` must be a list of strings or None.")
        if not all(isinstance(module, str) for module in self.target_modules):
            raise TypeError("All elements in `target_modules` must be strings.")
        # an explicit empty list is ambiguous (nothing vs. everything), so reject it
        if not self.target_modules:
            raise ValueError("`target_modules` cannot be an empty list. Use None for all modules.")
Lastly, the control is implemented via the `steer` method, which seeds the random number generator, iterates over
the model's linear layers (restricted to those whose names match `target_modules`, if given), and adds scaled
Gaussian noise to their parameters in-place.
import torch
from transformers import PreTrainedModel, PreTrainedTokenizer
from aisteer360.algorithms.structural_control.base import StructuralControl
from aisteer360.algorithms.structural_control.noise_injection.args import NoiseInjectionArgs
class NoiseInjection(StructuralControl):
    """Injects controlled Gaussian noise into model weights (e.g., for robustness testing)."""

    Args = NoiseInjectionArgs

    def steer(
        self,
        model: PreTrainedModel,
        tokenizer: PreTrainedTokenizer | None = None,
        **kwargs,
    ) -> PreTrainedModel:
        """Add seeded Gaussian noise, in place, to the parameters of targeted linear layers.

        Reads ``self.seed``, ``self.target_modules`` and ``self.noise_scale``
        (presumably populated from ``NoiseInjectionArgs`` by the base class -- confirm).

        Args:
            model: Model whose ``torch.nn.Linear`` submodules are perturbed in place.
            tokenizer: Unused by this control.
            **kwargs: Unused by this control.

        Returns:
            The same ``model`` object, with noise added to the selected parameters.
        """
        # CUDA devices holding model parameters; their RNG state is saved and restored
        # together with the CPU RNG state so that seeding here does not leak globally.
        cuda_devices = sorted(
            {
                param.device.index
                for param in model.parameters()
                if param.device.type == "cuda" and param.device.index is not None
            }
        )

        with torch.random.fork_rng(devices=cuda_devices), torch.no_grad():
            # seeding inside the fork keeps the noise reproducible without permanently
            # resetting the caller's random state (e.g. for later sampling)
            torch.manual_seed(self.seed)

            for name, module in model.named_modules():
                # only Linear layers are perturbed
                if not isinstance(module, torch.nn.Linear):
                    continue

                # with no specific targets, every Linear layer is used; otherwise the
                # module name must contain at least one target pattern
                if self.target_modules is not None and not any(
                    target in name for target in self.target_modules
                ):
                    continue

                # inject noise into all trainable parameters of this module
                for param in module.parameters():
                    if param.requires_grad:
                        noise = torch.randn_like(param) * self.noise_scale
                        param.data.add_(noise)

        return model
The control can then be called via:
from aisteer360.algorithms.structural_control.noise_injection.control import NoiseInjection
from aisteer360.algorithms.core.steering_pipeline import SteeringPipeline
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

# configure the control: small noise restricted to module names containing
# "q_proj" or "v_proj"
noise_injection = NoiseInjection(noise_scale=0.005, target_modules=["q_proj", "v_proj"], seed=42)

# build the pipeline around the control and run its steer step before generating
noise_injection_pipeline = SteeringPipeline(model_name_or_path=MODEL_NAME, controls=[noise_injection])
noise_injection_pipeline.steer()

# format a single-turn user prompt with the tokenizer's chat template
prompt = "What is a neural network?"
messages = [{"role": "user", "content": prompt}]
tokenizer = noise_injection_pipeline.tokenizer
chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# tokenize, generate with the steered pipeline, and show the result
inputs = tokenizer(chat, return_tensors="pt")
generated = noise_injection_pipeline.generate_text(inputs.input_ids, max_new_tokens=50)
print(generated)