def build_leaderboard_html( # pylint: disable=too-many-locals,too-many-branches,too-many-statements
patterns_dir: str | Path, optimization_metric: str = "faithfulness"
) -> str:
"""Build a styled HTML leaderboard from RAG pattern evaluation results.
Scans each subdirectory of *patterns_dir* for a ``pattern.json`` file,
ranks patterns by the specified optimisation metric, and produces a
self-contained HTML document with a responsive leaderboard table.
Parameters
----------
patterns_dir : str | Path
Directory of RAG patterns; each subdirectory must contain a
``pattern.json`` file.
optimization_metric : str, default="faithfulness"
Name of the metric used to rank patterns (e.g.
``"faithfulness"``, ``"answer_correctness"``).
Returns
-------
str
Self-contained HTML document with the leaderboard table.
Raises
------
FileNotFoundError
If *patterns_dir* does not exist or is not a directory.
"""
rag_patterns_dir = Path(patterns_dir)
if not rag_patterns_dir.is_dir():
raise FileNotFoundError(f"rag_patterns path is not a directory: {rag_patterns_dir}")
evaluations: list[dict] = []
for subdir in sorted(rag_patterns_dir.iterdir()):
if not subdir.is_dir():
continue
pattern_file = subdir / "pattern.json"
if not pattern_file.is_file():
continue
with pattern_file.open("r", encoding="utf-8") as f:
evaluations.append(json.load(f))
# Sort by optimization metric score descending; missing scores last
def _optimization_score(e: dict) -> tuple[bool, float]:
v = e.get("final_score")
if v is not None:
try:
return (False, -float(v))
except (TypeError, ValueError):
pass
raw = e.get("scores") or {}
aggregate = raw.get("scores") if isinstance(raw.get("scores"), dict) else raw
for _k, info in (aggregate or {}).items():
if isinstance(info, dict):
mean = info.get("mean")
if mean is not None:
try:
return (False, -float(mean))
except (TypeError, ValueError):
pass
return (True, 0)
evaluations.sort(key=_optimization_score)
# Discover metric columns present in data
all_metric_names: list[str] = []
for e in evaluations:
raw = e.get("scores") or {}
aggregate = raw.get("scores") if isinstance(raw.get("scores"), dict) else raw
for m in aggregate or {}:
if m not in all_metric_names:
all_metric_names.append(m)
metric_columns = [c for c in _LEADERBOARD_METRIC_COLUMNS if c.replace("mean_", "", 1) in all_metric_names]
for m in all_metric_names:
col = _metric_to_mean_key(m)
if col not in metric_columns:
metric_columns.append(col)
# Place the optimisation metric column right after Pattern_Name
opt_metric_col = _metric_to_mean_key(optimization_metric or "faithfulness")
if opt_metric_col in metric_columns:
other_metrics = [c for c in metric_columns if c != opt_metric_col]
metric_columns = [opt_metric_col] + other_metrics
config_columns = list(_LEADERBOARD_CONFIG_COLUMNS)
headers = ["Pattern_Name"] + metric_columns + config_columns
header_row = "".join(f"<th>{_header_two_lines(h)}</th>" for h in headers)
# Build rows and collect cell values for dynamic column width computation
rows_cells: list[list[str]] = []
rows: list[str] = []
for i, e in enumerate(evaluations):
pattern_name = e.get("name") or e.get("pattern_name") or (e.get("rag_pattern") or {}).get("name", "—")
raw = e.get("scores") or {}
scores = raw.get("scores") if isinstance(raw.get("scores"), dict) else raw
merged = (
_settings_from_rag_pattern(e)
or _normalize_flat_settings(e.get("settings"))
or _merge_params(e.get("indexing_params") or {}, e.get("rag_params") or {})
)
cells: list[str] = [str(pattern_name)]
for col in metric_columns:
metric_name = col.replace("mean_", "", 1)
info = scores.get(metric_name) or {}
mean = info.get("mean")
if mean is not None:
cell = f"{mean:.4f}" if isinstance(mean, (int, float)) else str(mean)
else:
cell = ""
cells.append(cell)
for col in config_columns:
val = _get_config_value(merged, col) if merged else None
if val is not None and (val != "" or col != "retrieval.ranker_strategy"):
if isinstance(val, dict):
cells.append(json.dumps(val, sort_keys=True))
else:
cells.append(str(val))
elif col == "retrieval.ranker_strategy":
cells.append("-")
else:
cells.append("")
rows_cells.append(cells)
tr_class = ' class="rank-1"' if i == 0 else ""
rows.append("<tr" + tr_class + ">" + "".join(f"<td>{html.escape(c)}</td>" for c in cells) + "</tr>")
table_body = "".join(rows)
# Dynamic column widths from content
ncols = len(headers)
column_max_len = [
max(
len(headers[i]),
max((len(rows_cells[r][i]) for r in range(len(rows_cells)))) if rows_cells else 0,
)
for i in range(ncols)
]
width_rem: list[float] = []
for i in range(ncols):
if i in (7, 12):
min_rem = 18 # embeddings.model_id, generation.model_id
elif i == 0:
min_rem = 10
elif i == 1:
min_rem = 12
else:
min_rem = 4
if "." in headers[i]:
parts = headers[i].split(".", 1)
line1_len = len(parts[0])
line2 = parts[1].replace("_", " ") if len(parts) > 1 else ""
if " " in line2:
last_space = line2.rfind(" ")
seg_lens = [line1_len, len(line2[:last_space]), len(line2[last_space + 1 :])]
else:
seg_lens = [line1_len, len(line2)] if line2 else [line1_len]
min_for_two_line = max(seg_lens) * 1.0
min_rem = max(min_rem, min_for_two_line)
w = max(min_rem, min(32, 0.6 * column_max_len[i]))
width_rem.append(w)
colgroup_html = (
" <colgroup>\n"
+ "\n".join(f' <col style="width: {width_rem[i]:.1f}rem">' for i in range(ncols))
+ "\n </colgroup>"
)
best_pattern_name = "—"
if evaluations:
best_pattern_name = (
evaluations[0].get("name")
or evaluations[0].get("pattern_name")
or (evaluations[0].get("rag_pattern") or {}).get("name", "—")
)
best_pattern_name = str(best_pattern_name)
return _build_leaderboard_html(
header_row=header_row,
table_body=table_body,
best_pattern_name=best_pattern_name,
num_patterns=len(evaluations),
eval_metric=optimization_metric or "faithfulness",
colgroup_html=colgroup_html,
)