Skip to content

Leaderboard

HTML leaderboard generation from RAG pattern evaluation results.

leaderboard

Functions

build_leaderboard_html

build_leaderboard_html(patterns_dir: str | Path, optimization_metric: str = 'faithfulness') -> str

Build a styled HTML leaderboard from RAG pattern evaluation results.

Scans each subdirectory of patterns_dir for a pattern.json file, ranks patterns by the specified optimisation metric, and produces a self-contained HTML document with a responsive leaderboard table.

Parameters:

  • patterns_dir (str | Path) –

    Directory of RAG patterns; each subdirectory is expected to contain a pattern.json file (subdirectories without one are skipped).

  • optimization_metric (str, default: "faithfulness") –

    Name of the metric used to rank patterns (e.g. "faithfulness", "answer_correctness").

Returns:

  • str

    Self-contained HTML document with the leaderboard table.

Raises:

  • FileNotFoundError

    If patterns_dir does not exist or is not a directory.

Source code in ai4rag/components/assets_generator/leaderboard.py
def build_leaderboard_html(  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    patterns_dir: str | Path, optimization_metric: str = "faithfulness"
) -> str:
    """Build a styled HTML leaderboard from RAG pattern evaluation results.

    Scans each subdirectory of *patterns_dir* for a ``pattern.json`` file,
    ranks patterns by the specified optimisation metric, and produces a
    self-contained HTML document with a responsive leaderboard table.

    Parameters
    ----------
    patterns_dir : str | Path
        Directory of RAG patterns; each subdirectory must contain a
        ``pattern.json`` file.
    optimization_metric : str, default="faithfulness"
        Name of the metric used to rank patterns (e.g.
        ``"faithfulness"``, ``"answer_correctness"``).

    Returns
    -------
    str
        Self-contained HTML document with the leaderboard table.

    Raises
    ------
    FileNotFoundError
        If *patterns_dir* does not exist or is not a directory.
    """
    rag_patterns_dir = Path(patterns_dir)
    if not rag_patterns_dir.is_dir():
        raise FileNotFoundError(f"rag_patterns path is not a directory: {rag_patterns_dir}")

    evaluations: list[dict] = []
    for subdir in sorted(rag_patterns_dir.iterdir()):
        if not subdir.is_dir():
            continue
        pattern_file = subdir / "pattern.json"
        if not pattern_file.is_file():
            continue
        with pattern_file.open("r", encoding="utf-8") as f:
            evaluations.append(json.load(f))

    # Sort by optimization metric score descending; missing scores last
    def _optimization_score(e: dict) -> tuple[bool, float]:
        v = e.get("final_score")
        if v is not None:
            try:
                return (False, -float(v))
            except (TypeError, ValueError):
                pass
        raw = e.get("scores") or {}
        aggregate = raw.get("scores") if isinstance(raw.get("scores"), dict) else raw
        for _k, info in (aggregate or {}).items():
            if isinstance(info, dict):
                mean = info.get("mean")
                if mean is not None:
                    try:
                        return (False, -float(mean))
                    except (TypeError, ValueError):
                        pass
        return (True, 0)

    evaluations.sort(key=_optimization_score)

    # Discover metric columns present in data
    all_metric_names: list[str] = []
    for e in evaluations:
        raw = e.get("scores") or {}
        aggregate = raw.get("scores") if isinstance(raw.get("scores"), dict) else raw
        for m in aggregate or {}:
            if m not in all_metric_names:
                all_metric_names.append(m)
    metric_columns = [c for c in _LEADERBOARD_METRIC_COLUMNS if c.replace("mean_", "", 1) in all_metric_names]
    for m in all_metric_names:
        col = _metric_to_mean_key(m)
        if col not in metric_columns:
            metric_columns.append(col)

    # Place the optimisation metric column right after Pattern_Name
    opt_metric_col = _metric_to_mean_key(optimization_metric or "faithfulness")
    if opt_metric_col in metric_columns:
        other_metrics = [c for c in metric_columns if c != opt_metric_col]
        metric_columns = [opt_metric_col] + other_metrics

    config_columns = list(_LEADERBOARD_CONFIG_COLUMNS)
    headers = ["Pattern_Name"] + metric_columns + config_columns
    header_row = "".join(f"<th>{_header_two_lines(h)}</th>" for h in headers)

    # Build rows and collect cell values for dynamic column width computation
    rows_cells: list[list[str]] = []
    rows: list[str] = []
    for i, e in enumerate(evaluations):
        pattern_name = e.get("name") or e.get("pattern_name") or (e.get("rag_pattern") or {}).get("name", "—")
        raw = e.get("scores") or {}
        scores = raw.get("scores") if isinstance(raw.get("scores"), dict) else raw
        merged = (
            _settings_from_rag_pattern(e)
            or _normalize_flat_settings(e.get("settings"))
            or _merge_params(e.get("indexing_params") or {}, e.get("rag_params") or {})
        )

        cells: list[str] = [str(pattern_name)]
        for col in metric_columns:
            metric_name = col.replace("mean_", "", 1)
            info = scores.get(metric_name) or {}
            mean = info.get("mean")
            if mean is not None:
                cell = f"{mean:.4f}" if isinstance(mean, (int, float)) else str(mean)
            else:
                cell = ""
            cells.append(cell)
        for col in config_columns:
            val = _get_config_value(merged, col) if merged else None
            if val is not None and (val != "" or col != "retrieval.ranker_strategy"):
                if isinstance(val, dict):
                    cells.append(json.dumps(val, sort_keys=True))
                else:
                    cells.append(str(val))
            elif col == "retrieval.ranker_strategy":
                cells.append("-")
            else:
                cells.append("")
        rows_cells.append(cells)
        tr_class = ' class="rank-1"' if i == 0 else ""
        rows.append("<tr" + tr_class + ">" + "".join(f"<td>{html.escape(c)}</td>" for c in cells) + "</tr>")

    table_body = "".join(rows)

    # Dynamic column widths from content
    ncols = len(headers)
    column_max_len = [
        max(
            len(headers[i]),
            max((len(rows_cells[r][i]) for r in range(len(rows_cells)))) if rows_cells else 0,
        )
        for i in range(ncols)
    ]
    width_rem: list[float] = []
    for i in range(ncols):
        if i in (7, 12):
            min_rem = 18  # embeddings.model_id, generation.model_id
        elif i == 0:
            min_rem = 10
        elif i == 1:
            min_rem = 12
        else:
            min_rem = 4
        if "." in headers[i]:
            parts = headers[i].split(".", 1)
            line1_len = len(parts[0])
            line2 = parts[1].replace("_", " ") if len(parts) > 1 else ""
            if " " in line2:
                last_space = line2.rfind(" ")
                seg_lens = [line1_len, len(line2[:last_space]), len(line2[last_space + 1 :])]
            else:
                seg_lens = [line1_len, len(line2)] if line2 else [line1_len]
            min_for_two_line = max(seg_lens) * 1.0
            min_rem = max(min_rem, min_for_two_line)
        w = max(min_rem, min(32, 0.6 * column_max_len[i]))
        width_rem.append(w)
    colgroup_html = (
        "          <colgroup>\n"
        + "\n".join(f'            <col style="width: {width_rem[i]:.1f}rem">' for i in range(ncols))
        + "\n          </colgroup>"
    )

    best_pattern_name = "—"
    if evaluations:
        best_pattern_name = (
            evaluations[0].get("name")
            or evaluations[0].get("pattern_name")
            or (evaluations[0].get("rag_pattern") or {}).get("name", "—")
        )
        best_pattern_name = str(best_pattern_name)

    return _build_leaderboard_html(
        header_row=header_row,
        table_body=table_body,
        best_pattern_name=best_pattern_name,
        num_patterns=len(evaluations),
        eval_metric=optimization_metric or "faithfulness",
        colgroup_html=colgroup_html,
    )