lm-evaluation-harness advanced usage

Use the lm-evaluation-harness extension from code when you need additional control over concurrency or execution options.

Note:

This is for advanced usage only; in most cases, use the CLI instead (see the lm_eval_cli example).

See lm_eval_model.py on GitHub.
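The script below authenticates via Credentials.from_env(), so the API key (and optionally a custom endpoint) must be available in the environment or in a .env file picked up by load_dotenv(). A minimal pre-flight check you can run first; GENAI_KEY is an assumption about the variable name read by recent SDK versions, so verify it against your SDK documentation:

import os

from dotenv import load_dotenv

load_dotenv()
# GENAI_KEY is assumed to be the credential variable name; adjust if your SDK
# version expects something different (e.g. a custom endpoint via GENAI_API).
if not os.environ.get("GENAI_KEY"):
    raise RuntimeError("Set GENAI_KEY in your environment or .env file before running the evaluation")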
import logging
from pprint import pprint

from dotenv import load_dotenv
from lm_eval import simple_evaluate

from genai import Client, Credentials
from genai.extensions.lm_eval.model import IBMGenAILMEval
from genai.schema import TextGenerationParameters

load_dotenv()

logging.getLogger("httpx").setLevel(logging.WARN)
logging.getLogger("genai").setLevel(logging.WARN)

task_name = "arc_challenge"
model_id = "mistralai/mixtral-8x7b-instruct-v01"
num_fewshot = 25  # ARC-Challenge is commonly reported with 25-shot prompting
limit = 10  # WARNING: for debugging only; set to None to evaluate the full test set

# Retry generously on transient HTTP failures so long evaluation runs are not interrupted
client = Client(
    credentials=Credentials.from_env(),
    config={"api_client_config": {"transport_options": {"retries": 999}}},
)
# Wrap the client in an lm-eval compatible model; temperature=0 keeps generations deterministic
model = IBMGenAILMEval(
    client=client,
    model_id=model_id,
    show_progressbar=True,
    parameters=TextGenerationParameters(temperature=0),
)
# Run the benchmark; log_samples=False keeps only aggregate metrics
results = simple_evaluate(model, tasks=[task_name], num_fewshot=num_fewshot, log_samples=False, limit=limit)

# Attach information about the model and few-shot configuration to the results
results["config"] = {
    "model": model_id,
    "use_cache": False,
    "limit": limit,
    "model_kwargs": model.dump_parameters(),
}

pprint(results)
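
To keep the scores around for later comparison, you can serialize the results dictionary instead of only printing it. A minimal sketch using the standard library; default=str is a precaution because some values produced by lm-eval may not be JSON-serializable out of the box:

import json

# Write the aggregate metrics and config to disk; default=str guards against
# values the json module cannot serialize natively.
output_path = f"{task_name}_{model_id.replace('/', '_')}_results.json"
with open(output_path, "w") as f:
    json.dump(results, f, indent=2, default=str)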