From fa65497c50eb38c065b578acbfe29e6971c2b023 Mon Sep 17 00:00:00 2001
From: Antoine Richard <antoiner@nvidia.com>
Date: Thu, 28 May 2026 17:32:01 +0200
Subject: [PATCH 1/2] Add public v1.0 benchmark schema and recorder peak fields

Promote the JSON bundle schema produced by the standalone benchmark
scripts under scripts/benchmarks/ into a real public-API module,
isaaclab.benchmark.schema. Until now there was no single place in
Isaac Lab that defined the shape of training.json / startup.json, even
though three benchmark scripts emit it and downstream tooling (e.g. the
in-tree Odin evaluation harness) is starting to consume it.

The module ships frozen dataclasses for TrainingBundle, StartupBundle,
and all their building blocks, plus a small write_bundle_file helper
that serialises any dataclass tree as schema-v1 JSON. The package
__init__ re-exports the public surface so callers can write
`from isaaclab.benchmark import TrainingBundle`.

This commit also extends GPUInfoRecorder and MemoryInfoRecorder to
report a peak (running max) alongside the existing mean/std rows. The
GPU memory, RSS, and VMS peak rows are always emitted (initialised to
0.0) so dashboards see a consistent key set regardless of whether any
sample was recorded; the utilisation and USS peak rows are emitted only
together with their mean/std rows. Existing rows are unchanged.

The benchmark scripts themselves continue to use the legacy output
format on develop today; a follow-up PR rewrites them to emit
schema-v1 bundles directly via this module.
---
 .../antoiner-feat-benchmark-schema.rst        |  25 ++
 .../isaaclab/isaaclab/benchmark/__init__.py   |  66 +++++
 source/isaaclab/isaaclab/benchmark/schema.py  | 258 ++++++++++++++++++
 .../benchmark/recorders/record_gpu_info.py    |  28 +-
 .../benchmark/recorders/record_memory_info.py |  27 ++
 .../isaaclab/test/benchmark/test_recorders.py | 150 +++++++++-
 source/isaaclab/test/benchmark/test_schema.py | 175 ++++++++++++
 7 files changed, 721 insertions(+), 8 deletions(-)
 create mode 100644 source/isaaclab/changelog.d/antoiner-feat-benchmark-schema.rst
 create mode 100644 source/isaaclab/isaaclab/benchmark/__init__.py
 create mode 100644 source/isaaclab/isaaclab/benchmark/schema.py
 create mode 100644 source/isaaclab/test/benchmark/test_schema.py

diff --git a/source/isaaclab/changelog.d/antoiner-feat-benchmark-schema.rst b/source/isaaclab/changelog.d/antoiner-feat-benchmark-schema.rst
new file mode 100644
index 000000000000..77cb9ec5c44e
--- /dev/null
+++ b/source/isaaclab/changelog.d/antoiner-feat-benchmark-schema.rst
@@ -0,0 +1,25 @@
+Added
+^^^^^
+
+* Added :mod:`isaaclab.benchmark.schema`, the public v1.0 JSON schema for
+  benchmark bundles produced by the standalone scripts under
+  ``scripts/benchmarks/``. Exposes :class:`~isaaclab.benchmark.schema.TrainingBundle`
+  and :class:`~isaaclab.benchmark.schema.StartupBundle` plus the supporting
+  :class:`~isaaclab.benchmark.schema.Versions`, :class:`~isaaclab.benchmark.schema.Hardware`,
+  :class:`~isaaclab.benchmark.schema.Runtime`, :class:`~isaaclab.benchmark.schema.Resources`,
+  and :class:`~isaaclab.benchmark.schema.Learning` records, along with
+  :func:`~isaaclab.benchmark.schema.write_bundle_file` for emitting
+  schema-compliant JSON. The package root re-exports the same surface so
+  ``from isaaclab.benchmark import TrainingBundle`` works.
+
+Changed
+^^^^^^^
+
+* Extended :class:`~isaaclab.test.benchmark.recorders.GPUInfoRecorder` and
+  ``MemoryInfoRecorder`` to also report **peak** (running maximum) memory and
+  utilisation alongside the existing mean/std rows. Existing rows are
+  unchanged; new rows are the GPU-prefixed ``"Memory Used peak"`` and
+  ``"Utilization peak"``, plus ``"System Memory RSS peak"``, ``"System Memory VMS peak"``,
+  and ``"System Memory USS peak"``. The GPU memory, RSS, and VMS peak rows are
+  always emitted (initialised to ``0.0``); the utilisation and USS peak rows are
+  emitted only together with their mean/std rows.
diff --git a/source/isaaclab/isaaclab/benchmark/__init__.py b/source/isaaclab/isaaclab/benchmark/__init__.py
new file mode 100644
index 000000000000..a1d4f4e1e5e4
--- /dev/null
+++ b/source/isaaclab/isaaclab/benchmark/__init__.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Public benchmark-bundle schema for Isaac Lab.
+
+The standalone benchmark scripts under ``scripts/benchmarks/`` emit
+self-contained JSON bundles described by the v1.0 schema in
+:mod:`isaaclab.benchmark.schema`. Importing from the package root works for
+the common types::
+
+    from isaaclab.benchmark import TrainingBundle, StartupBundle, write_bundle_file
+
+See :mod:`isaaclab.benchmark.schema` for the full set of dataclasses.
+"""
+
+from .schema import (
+    SCHEMA_VERSION,
+    Backend,
+    CProfileFunction,
+    Framework,
+    GpuDeviceInfo,
+    Hardware,
+    Learning,
+    LearningCurve,
+    MeanStd,
+    MeanStdPeak,
+    Resources,
+    RunIdentity,
+    RunStatus,
+    Runtime,
+    StartupBundle,
+    StartupConfig,
+    StartupPhase,
+    StartupPhaseTimes,
+    StartupRunIdentity,
+    TrainingBundle,
+    Versions,
+    write_bundle_file,
+)
+
+__all__ = [
+    "SCHEMA_VERSION",
+    "Backend",
+    "CProfileFunction",
+    "Framework",
+    "GpuDeviceInfo",
+    "Hardware",
+    "Learning",
+    "LearningCurve",
+    "MeanStd",
+    "MeanStdPeak",
+    "Resources",
+    "RunIdentity",
+    "RunStatus",
+    "Runtime",
+    "StartupBundle",
+    "StartupConfig",
+    "StartupPhase",
+    "StartupPhaseTimes",
+    "StartupRunIdentity",
+    "TrainingBundle",
+    "Versions",
+    "write_bundle_file",
+]
diff --git a/source/isaaclab/isaaclab/benchmark/schema.py b/source/isaaclab/isaaclab/benchmark/schema.py
new file mode 100644
index 000000000000..c961edc644c9
--- /dev/null
+++ b/source/isaaclab/isaaclab/benchmark/schema.py
@@ -0,0 +1,258 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Public schema for Isaac Lab benchmark bundles (v1.0).
+
+Defines the on-disk JSON schema produced by the standalone benchmark scripts
+under ``scripts/benchmarks/``: ``benchmark_startup.py``, ``benchmark_rsl_rl.py``,
+and ``benchmark_skrl.py``. Producers populate a :class:`TrainingBundle` or
+:class:`StartupBundle` and call :func:`write_bundle_file` to emit
+schema-compliant JSON. Consumers (dashboards, regression-comparison tools,
+the in-tree Odin evaluation harness under ``tools/odin/``) read the same file
+and reconstruct the dataclasses.
+
+Each bundle is self-contained: every top-level bundle carries its own
+:class:`Versions` and :class:`Hardware` metadata so a reader need not
+cross-reference other files in the bundle directory.
+
+Current version: 1.0
+"""
+
+from __future__ import annotations
+
+import dataclasses
+import json
+import os
+from dataclasses import dataclass
+from typing import Any, Literal
+
+SCHEMA_VERSION = "1.0"
+
+Framework = Literal["rsl_rl", "skrl"]
+Backend = Literal["physx", "newton"]
+RunStatus = Literal["completed", "interrupted", "crashed"]
+
+
+@dataclass(frozen=True)
+class MeanStd:
+    """Scalar with mean and standard deviation."""
+
+    mean: float
+    std: float
+
+
+@dataclass(frozen=True)
+class MeanStdPeak:
+    """Scalar with mean, standard deviation, and peak."""
+
+    mean: float
+    std: float
+    peak: float
+
+
+@dataclass(frozen=True)
+class GpuDeviceInfo:
+    """Information about a single GPU device."""
+
+    name: str
+    mem_gb: float
+    compute_cap: str
+
+
+@dataclass(frozen=True)
+class Hardware:
+    """Host hardware snapshot captured at run time."""
+
+    hostname: str
+    gpu_devices: list[GpuDeviceInfo]
+    cpu_name: str
+    cpu_count: int
+    ram_gb: float
+
+
+@dataclass(frozen=True)
+class Versions:
+    """Software versions captured at run time.
+
+    Framework-specific fields (``rsl_rl``, ``skrl``) are ``None`` when the
+    corresponding framework is not used by the run.
+    """
+
+    isaaclab: str
+    isaacsim: str | None
+    kit: str | None
+    newton: str | None
+    warp: str | None
+    mjwarp: str | None
+    torch: str
+    rsl_rl: str | None
+    skrl: str | None
+    git_commit: str | None
+    git_branch: str | None
+    git_dirty: bool
+
+
+@dataclass(frozen=True)
+class RunIdentity:
+    """Identity of a training run."""
+
+    run_id: str
+    framework: Framework
+    backend: Backend
+    task: str
+    seed: int
+    num_envs: int
+    max_iterations: int
+    start_time_utc: str
+    end_time_utc: str
+    duration_s: float
+    status: RunStatus
+
+
+@dataclass(frozen=True)
+class StartupPhaseTimes:
+    """Wall-clock duration of each startup phase [s]."""
+
+    app_launch: float
+    env_creation: float
+    first_step: float
+    python_imports: float | None = None
+    task_config: float | None = None
+
+
+@dataclass(frozen=True)
+class Runtime:
+    """Aggregated runtime metrics for a training run."""
+
+    startup_phase_times_s: StartupPhaseTimes
+    iterations_completed: int
+    total_wall_time_s: float
+    steps_per_iteration: int
+    iteration_time_s: MeanStd
+    env_steps_per_s: MeanStd
+    iterations_per_s: MeanStd
+
+
+@dataclass(frozen=True)
+class Resources:
+    """Aggregated resource utilisation metrics for a training run."""
+
+    gpu_util_pct: MeanStd
+    gpu_mem_gb: MeanStdPeak
+    cpu_util_pct: MeanStd
+    ram_gb: MeanStdPeak
+
+
+@dataclass(frozen=True)
+class LearningCurve:
+    """One learning curve (reward or episode length)."""
+
+    final_raw: float
+    final_ema: float
+    series_per_iter: list[float] | None
+
+
+@dataclass(frozen=True)
+class Learning:
+    """Learning curves for a training run, plus their EMA smoothing factor."""
+
+    ema_alpha: float
+    reward: LearningCurve
+    ep_length: LearningCurve
+
+
+@dataclass(frozen=True)
+class TrainingBundle:
+    """Top-level shape of ``training.json``."""
+
+    run: RunIdentity
+    versions: Versions
+    hardware: Hardware
+    runtime: Runtime
+    resources: Resources
+    learning: Learning
+    schema_version: str = SCHEMA_VERSION
+
+
+@dataclass(frozen=True)
+class CProfileFunction:
+    """One entry from a cProfile top-N table."""
+
+    name: str
+    own_time_s: float
+    cum_time_s: float
+    calls: int
+
+
+@dataclass(frozen=True)
+class StartupPhase:
+    """Wall-clock total plus top cProfile functions for one startup phase."""
+
+    total_time_s: float
+    top_functions: list[CProfileFunction]
+
+
+@dataclass(frozen=True)
+class StartupConfig:
+    """CLI configuration captured in a :class:`StartupBundle`."""
+
+    top_n: int
+    whitelist: str | None
+
+
+@dataclass(frozen=True)
+class StartupRunIdentity:
+    """Startup runs omit ``num_envs`` / ``max_iterations`` (not meaningful)."""
+
+    run_id: str
+    framework: Framework
+    backend: Backend
+    task: str
+    seed: int
+    start_time_utc: str
+    end_time_utc: str
+    duration_s: float
+    status: RunStatus
+
+
+@dataclass(frozen=True)
+class StartupBundle:
+    """Top-level shape of ``startup.json``."""
+
+    run: StartupRunIdentity
+    versions: Versions
+    hardware: Hardware
+    phases: dict[str, StartupPhase]
+    config: StartupConfig
+    schema_version: str = SCHEMA_VERSION
+
+
+def _to_plain(obj: Any) -> Any:
+    """Recursively convert dataclass instances to plain dicts/lists."""
+    if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
+        return {f.name: _to_plain(getattr(obj, f.name)) for f in dataclasses.fields(obj)}
+    if isinstance(obj, list):
+        return [_to_plain(x) for x in obj]
+    if isinstance(obj, dict):
+        return {k: _to_plain(v) for k, v in obj.items()}
+    return obj
+
+
+def write_bundle_file(bundle: Any, path: str) -> None:
+    """Write a bundle dataclass to disk as schema-v1 JSON.
+
+    Creates the parent directory if missing. Uses ``indent=2`` for readability;
+    payloads are small (~10 KB training.json, ~50 KB startup.json).
+
+    Args:
+        bundle: A dataclass instance to serialise. Typically
+            :class:`TrainingBundle` or :class:`StartupBundle`; any frozen
+            dataclass tree composed of primitives, lists, and dicts works.
+        path: Output file path.
+    """
+    os.makedirs(os.path.dirname(os.path.abspath(path)) or ".", exist_ok=True)
+    with open(path, "w") as f:
+        json.dump(_to_plain(bundle), f, indent=2, sort_keys=False)
+        f.write("\n")
diff --git a/source/isaaclab/isaaclab/test/benchmark/recorders/record_gpu_info.py b/source/isaaclab/isaaclab/test/benchmark/recorders/record_gpu_info.py
index 8ef734b53885..7e920cca42f6 100644
--- a/source/isaaclab/isaaclab/test/benchmark/recorders/record_gpu_info.py
+++ b/source/isaaclab/isaaclab/test/benchmark/recorders/record_gpu_info.py
@@ -37,6 +37,10 @@ def __init__(self):
         self._util_n = []
         self._util_m2 = []
 
+        # Per-device peak (running max) for memory (bytes) and utilization (%)
+        self._mem_peak = []
+        self._util_peak = []
+
         # pynvml device handles (one per GPU)
         self._handles = []
         self._nvml_available = False
@@ -75,6 +79,9 @@ def _get_hardware_info(self) -> None:
             self._util_std.append(0)
             self._util_n.append(0)
             self._util_m2.append(0)
+            # Peak state (running max)
+            self._mem_peak.append(0.0)
+            self._util_peak.append(0.0)
 
         # CUDA version
         with contextlib.suppress(Exception):
@@ -163,9 +170,11 @@ def _get_runtime_info(self) -> None:
             self._mem_m2[i] += delta * delta2
             if self._mem_n[i] > 1:
                 self._mem_std[i] = math.sqrt(self._mem_m2[i] / (self._mem_n[i] - 1))
+            self._mem_peak[i] = max(self._mem_peak[i], float(memory_bytes))
 
             self._gpu_runtime_info["devices"][i]["memory_used_mean_bytes"] = self._mem_mean[i]
             self._gpu_runtime_info["devices"][i]["memory_used_std_bytes"] = self._mem_std[i]
+            self._gpu_runtime_info["devices"][i]["memory_used_peak_bytes"] = self._mem_peak[i]
             self._gpu_runtime_info["devices"][i]["memory_n"] = self._mem_n[i]
 
             # GPU utilization from pynvml or nvidia-smi fallback
@@ -189,9 +198,11 @@ def _get_runtime_info(self) -> None:
                 self._util_m2[i] += delta * delta2
                 if self._util_n[i] > 1:
                     self._util_std[i] = math.sqrt(self._util_m2[i] / (self._util_n[i] - 1))
+                self._util_peak[i] = max(self._util_peak[i], float(gpu_util))
 
                 self._gpu_runtime_info["devices"][i]["utilization_mean_percent"] = self._util_mean[i]
                 self._gpu_runtime_info["devices"][i]["utilization_std_percent"] = self._util_std[i]
+                self._gpu_runtime_info["devices"][i]["utilization_peak_percent"] = self._util_peak[i]
                 self._gpu_runtime_info["devices"][i]["utilization_n"] = self._util_n[i]
 
     def update(self) -> None:
@@ -252,7 +263,7 @@ def get_data(self) -> MeasurementData:
                 runtime = device_runtime[i]
                 prefix = f"GPU {i} " if self._device_count > 1 else "GPU "
 
-                # Memory used
+                # Memory used (mean/std/n only when updates have been recorded)
                 if "memory_used_mean_bytes" in runtime:
                     measurements.append(
                         SingleMeasurement(
@@ -275,6 +286,14 @@ def get_data(self) -> MeasurementData:
                             unit="",
                         )
                     )
+                # Peak is always emitted (initialised to 0.0, rises on first update)
+                measurements.append(
+                    SingleMeasurement(
+                        name=f"{prefix}Memory Used peak",
+                        value=self._bytes_to_gb(self._mem_peak[i]),
+                        unit="GB",
+                    )
+                )
 
                 # GPU Utilization
                 if "utilization_mean_percent" in runtime:
@@ -292,6 +311,13 @@ def get_data(self) -> MeasurementData:
                             unit="%",
                         )
                     )
+                    measurements.append(
+                        SingleMeasurement(
+                            name=f"{prefix}Utilization peak",
+                            value=round(runtime.get("utilization_peak_percent", 0), 2),
+                            unit="%",
+                        )
+                    )
                     measurements.append(
                         SingleMeasurement(
                             name=f"{prefix}Utilization n",
diff --git a/source/isaaclab/isaaclab/test/benchmark/recorders/record_memory_info.py b/source/isaaclab/isaaclab/test/benchmark/recorders/record_memory_info.py
index 8ad0f54304e3..8e0a786721ea 100644
--- a/source/isaaclab/isaaclab/test/benchmark/recorders/record_memory_info.py
+++ b/source/isaaclab/isaaclab/test/benchmark/recorders/record_memory_info.py
@@ -33,6 +33,12 @@ def __init__(self):
         self._uss_m2 = 0
         self._uss_n = 0
 
+        # Peak (running max) alongside the Welford mean/std. Initialised to
+        # 0.0 so emit-before-record returns a meaningful zero.
+        self._rss_peak = 0.0
+        self._vms_peak = 0.0
+        self._uss_peak = 0.0
+
         # Process handle
         self._process = psutil.Process(os.getpid())
         self._get_hardware_info()
@@ -62,17 +68,21 @@ def _get_runtime_info(self) -> None:
         self._rss_mean, self._rss_m2, self._rss_n, rss_std = self._update_welford(
             mem_info.rss, self._rss_mean, self._rss_m2, self._rss_n
         )
+        self._rss_peak = max(self._rss_peak, float(mem_info.rss))
         self._memory_runtime_info["rss_mean"] = self._rss_mean
         self._memory_runtime_info["rss_std"] = rss_std
         self._memory_runtime_info["rss_n"] = self._rss_n
+        self._memory_runtime_info["rss_peak"] = self._rss_peak
 
         # VMS (Virtual Memory Size) - total virtual memory
         self._vms_mean, self._vms_m2, self._vms_n, vms_std = self._update_welford(
             mem_info.vms, self._vms_mean, self._vms_m2, self._vms_n
         )
+        self._vms_peak = max(self._vms_peak, float(mem_info.vms))
         self._memory_runtime_info["vms_mean"] = self._vms_mean
         self._memory_runtime_info["vms_std"] = vms_std
         self._memory_runtime_info["vms_n"] = self._vms_n
+        self._memory_runtime_info["vms_peak"] = self._vms_peak
 
         # USS (Unique Set Size) - memory unique to process (not shared)
         try:
@@ -80,9 +90,11 @@ def _get_runtime_info(self) -> None:
             self._uss_mean, self._uss_m2, self._uss_n, uss_std = self._update_welford(
                 uss, self._uss_mean, self._uss_m2, self._uss_n
             )
+            self._uss_peak = max(self._uss_peak, float(uss))
             self._memory_runtime_info["uss_mean"] = self._uss_mean
             self._memory_runtime_info["uss_std"] = uss_std
             self._memory_runtime_info["uss_n"] = self._uss_n
+            self._memory_runtime_info["uss_peak"] = self._uss_peak
         except (psutil.AccessDenied, AttributeError):
             # USS may not be available on all platforms
             pass
@@ -117,6 +129,11 @@ def get_data(self) -> MeasurementData:
                 value=self._bytes_to_gb(self._memory_runtime_info.get("rss_std", 0)),
                 unit="GB",
             ),
+            SingleMeasurement(
+                name="System Memory RSS peak",
+                value=self._bytes_to_gb(self._memory_runtime_info.get("rss_peak", 0)),
+                unit="GB",
+            ),
             SingleMeasurement(name="System Memory RSS n", value=self._memory_runtime_info.get("rss_n", 0), unit=""),
             # VMS (Virtual Memory Size)
             SingleMeasurement(
@@ -129,6 +146,11 @@ def get_data(self) -> MeasurementData:
                 value=self._bytes_to_gb(self._memory_runtime_info.get("vms_std", 0)),
                 unit="GB",
             ),
+            SingleMeasurement(
+                name="System Memory VMS peak",
+                value=self._bytes_to_gb(self._memory_runtime_info.get("vms_peak", 0)),
+                unit="GB",
+            ),
             SingleMeasurement(name="System Memory VMS n", value=self._memory_runtime_info.get("vms_n", 0), unit=""),
         ]
 
@@ -146,6 +168,11 @@ def get_data(self) -> MeasurementData:
                         value=self._bytes_to_gb(self._memory_runtime_info.get("uss_std", 0)),
                         unit="GB",
                     ),
+                    SingleMeasurement(
+                        name="System Memory USS peak",
+                        value=self._bytes_to_gb(self._memory_runtime_info.get("uss_peak", 0)),
+                        unit="GB",
+                    ),
                     SingleMeasurement(
                         name="System Memory USS n", value=self._memory_runtime_info.get("uss_n", 0), unit=""
                     ),
diff --git a/source/isaaclab/test/benchmark/test_recorders.py b/source/isaaclab/test/benchmark/test_recorders.py
index fc519bb051d8..0cdb5f556b4c 100644
--- a/source/isaaclab/test/benchmark/test_recorders.py
+++ b/source/isaaclab/test/benchmark/test_recorders.py
@@ -236,10 +236,12 @@ def test_get_data_returns_measurement_data(self, recorder):
 
         measurement_data = recorder.get_data()
         assert isinstance(measurement_data, MeasurementData)
-        # GPU data includes measurements (memory and utilization stats)
-        # 6 measurements per GPU: memory (mean, std, n) + utilization (mean, std, n)
+        # GPU data includes measurements per GPU:
+        #   memory: mean, std, n + peak (always) = 4
+        #   utilization: mean, std, peak, n (when nvml/smi available) = 4
+        # Total is 4 (memory-only) or 8 (memory + utilization) per GPU.
         num_gpus = data["gpu_metadata"]["device_count"]
-        assert len(measurement_data.measurements) == 6 * num_gpus
+        assert len(measurement_data.measurements) in (4 * num_gpus, 8 * num_gpus)
         # 4 metadata entries: device_count, current_device, cuda_version, gpu_devices dict
         assert len(measurement_data.metadata) == 4
 
@@ -287,6 +289,69 @@ def test_get_data_devices_dict_structure(self, recorder):
         assert "compute_capability" in device_0
         assert "multi_processor_count" in device_0
 
+    def test_mem_peak_is_zero_before_any_record(self, monkeypatch):
+        """Peak memory row for device 0 exists and is 0.0 before any update."""
+        import torch
+
+        from isaaclab.test.benchmark.recorders.record_gpu_info import GPUInfoRecorder
+
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
+        monkeypatch.setattr(torch.cuda, "device_count", lambda: 1)
+        monkeypatch.setattr(torch.cuda, "current_device", lambda: 0)
+
+        class _FakeProps:
+            name = "FakeGPU"
+            total_memory = 80 * 1024**3
+            major = 9
+            minor = 0
+            multi_processor_count = 132
+
+        monkeypatch.setattr(torch.cuda, "get_device_properties", lambda i: _FakeProps())
+
+        rec = GPUInfoRecorder()
+        data = rec.get_data()
+        peaks = [m for m in data.measurements if "peak" in m.name.lower() and "GPU" in m.name]
+        # At minimum there should be a GPU memory peak row for device 0.
+        mem_peak_rows = [m for m in peaks if "Memory" in m.name]
+        assert mem_peak_rows, f"expected a GPU memory peak row, got names: {[m.name for m in data.measurements]}"
+        assert mem_peak_rows[0].value == 0.0
+
+    def test_mem_peak_tracks_running_max(self, monkeypatch):
+        """Feed the recorder a scripted memory sequence; peak must match the max."""
+        import torch
+
+        from isaaclab.test.benchmark.recorders.record_gpu_info import GPUInfoRecorder
+
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
+        monkeypatch.setattr(torch.cuda, "device_count", lambda: 1)
+        monkeypatch.setattr(torch.cuda, "current_device", lambda: 0)
+
+        class _FakeProps:
+            name = "FakeGPU"
+            total_memory = 80 * 1024**3
+            major = 9
+            minor = 0
+            multi_processor_count = 132
+
+        monkeypatch.setattr(torch.cuda, "get_device_properties", lambda i: _FakeProps())
+
+        rec = GPUInfoRecorder()
+
+        # Bypass nvml / nvidia-smi entirely and drive memory_allocated.
+        scripted_mem = iter([10 * 1024**3, 50 * 1024**3, 30 * 1024**3])  # 10 GB, 50 GB, 30 GB
+        monkeypatch.setattr(torch.cuda, "memory_allocated", lambda i: next(scripted_mem))
+        rec._nvml_available = False
+        rec._nvidia_smi_available = False
+
+        for _ in range(3):
+            rec.update()
+
+        data = rec.get_data()
+        mem_peak_rows = [m for m in data.measurements if "Memory" in m.name and "peak" in m.name.lower()]
+        assert mem_peak_rows, "expected a GPU memory peak row"
+        # 50 GB is the max.
+        assert mem_peak_rows[0].value == 50.0, f"expected 50.0 GB peak, got {mem_peak_rows[0].value}"
+
 
 # ==============================================================================
 # MemoryInfoRecorder Tests
@@ -392,10 +457,10 @@ def test_get_data_returns_measurement_data(self, recorder):
 
         data = recorder.get_data()
         assert isinstance(data, MeasurementData)
-        # 6 measurements for RSS and VMS (mean, std, n for each)
-        # Plus potentially 3 more for USS if available (mean, std, n)
-        assert len(data.measurements) >= 6
-        assert len(data.measurements) <= 9
+        # 8 measurements for RSS and VMS (mean, std, peak, n for each)
+        # Plus potentially 4 more for USS if available (mean, std, peak, n)
+        assert len(data.measurements) >= 8
+        assert len(data.measurements) <= 12
         assert len(data.metadata) == 1
 
     def test_get_data_measurement_names(self, recorder):
@@ -423,6 +488,77 @@ def test_get_data_metadata_names(self, recorder):
         names = [m.name for m in data.metadata]
         assert "total_ram_gb" in names
 
+    def test_rss_peak_is_zero_before_any_record(self):
+        """Test that RSS peak is 0.0 before any update has been called."""
+        from isaaclab.test.benchmark.recorders.record_memory_info import MemoryInfoRecorder
+
+        rec = MemoryInfoRecorder()
+        data = rec.get_data()
+        peak_rows = [m for m in data.measurements if m.name == "System Memory RSS peak"]
+        assert peak_rows, "expected a 'System Memory RSS peak' SingleMeasurement"
+        assert peak_rows[0].value == 0.0
+
+    def test_rss_peak_tracks_running_max(self, monkeypatch):
+        """Test that RSS peak tracks the running maximum across updates."""
+        import psutil
+
+        from isaaclab.test.benchmark.recorders.record_memory_info import MemoryInfoRecorder
+
+        # Scripted RSS sequence; peak must equal the max seen so far.
+        scripted_values = [100 * 1024**3, 200 * 1024**3, 150 * 1024**3]  # bytes
+        scripted_iter = iter(scripted_values)
+
+        class _FakeMemInfo:
+            def __init__(self, rss):
+                self.rss = rss
+                self.vms = rss  # mirror so VMS also moves
+                # USS is read via memory_full_info, not memory_info; leave alone.
+
+        def _fake_memory_info(self):  # noqa: ARG001 — bound method, self is the process
+            return _FakeMemInfo(next(scripted_iter))
+
+        monkeypatch.setattr(psutil.Process, "memory_info", _fake_memory_info)
+
+        rec = MemoryInfoRecorder()
+        for _ in scripted_values:
+            rec.update()
+
+        data = rec.get_data()
+        rss_peak = next(m for m in data.measurements if m.name == "System Memory RSS peak")
+        # The recorder emits GB; input was in bytes. 200 GiB -> 200.0 after rounding.
+        assert rss_peak.value == 200.0, f"expected peak=200.0 GB, got {rss_peak.value}"
+
+        vms_peak = next(m for m in data.measurements if m.name == "System Memory VMS peak")
+        assert vms_peak.value == 200.0
+
+    def test_rss_peak_does_not_decrease(self, monkeypatch):
+        """Test that RSS peak does not decrease when memory usage drops."""
+        import psutil
+
+        from isaaclab.test.benchmark.recorders.record_memory_info import MemoryInfoRecorder
+
+        # Decreasing sequence — peak is set by the first sample and then stays.
+        scripted_values = [300 * 1024**3, 50 * 1024**3, 25 * 1024**3]
+        scripted_iter = iter(scripted_values)
+
+        class _FakeMemInfo:
+            def __init__(self, rss):
+                self.rss = rss
+                self.vms = rss
+
+        def _fake_memory_info(self):  # noqa: ARG001
+            return _FakeMemInfo(next(scripted_iter))
+
+        monkeypatch.setattr(psutil.Process, "memory_info", _fake_memory_info)
+
+        rec = MemoryInfoRecorder()
+        for _ in scripted_values:
+            rec.update()
+
+        data = rec.get_data()
+        rss_peak = next(m for m in data.measurements if m.name == "System Memory RSS peak")
+        assert rss_peak.value == 300.0
+
 
 # ==============================================================================
 # VersionInfoRecorder Tests
diff --git a/source/isaaclab/test/benchmark/test_schema.py b/source/isaaclab/test/benchmark/test_schema.py
new file mode 100644
index 000000000000..2d17975a44a7
--- /dev/null
+++ b/source/isaaclab/test/benchmark/test_schema.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Tests for the v1.0 Isaac Lab benchmark schema."""
+
+import dataclasses
+import json
+import os
+
+import pytest
+
+from isaaclab.benchmark.schema import (
+    SCHEMA_VERSION,
+    CProfileFunction,
+    GpuDeviceInfo,
+    Hardware,
+    Learning,
+    LearningCurve,
+    MeanStd,
+    MeanStdPeak,
+    Resources,
+    RunIdentity,
+    Runtime,
+    StartupBundle,
+    StartupConfig,
+    StartupPhase,
+    StartupPhaseTimes,
+    StartupRunIdentity,
+    TrainingBundle,
+    Versions,
+    write_bundle_file,
+)
+
+
+def _minimal_training_bundle() -> TrainingBundle:
+    """Build a fully populated TrainingBundle fixture; every value is an arbitrary placeholder."""
+    return TrainingBundle(
+        run=RunIdentity(
+            run_id="rsl-rl_physx_Isaac-Ant-Direct-v0_20260422-131500_seed42",
+            framework="rsl_rl",
+            backend="physx",
+            task="Isaac-Ant-Direct-v0",
+            seed=42,
+            num_envs=4096,
+            max_iterations=500,
+            start_time_utc="2026-04-22T13:15:00Z",
+            end_time_utc="2026-04-22T13:47:22Z",
+            duration_s=1942.1,
+            status="completed",
+        ),
+        versions=Versions(
+            isaaclab="4.6.8",
+            isaacsim="5.0.0",
+            kit="107.1.0",
+            newton="0.1.2",
+            warp="1.7.3",
+            mjwarp="0.0.4",
+            torch="2.5.1",
+            rsl_rl="2.3.0",
+            skrl=None,  # deliberately unset: the round-trip test expects a JSON null here
+            git_commit="3d42b11d513",
+            git_branch="develop",
+            git_dirty=False,
+        ),
+        hardware=Hardware(
+            hostname="benchmark-host",
+            gpu_devices=[GpuDeviceInfo(name="NVIDIA H100 80GB", mem_gb=80.0, compute_cap="9.0")],
+            cpu_name="AMD EPYC 7763",
+            cpu_count=64,
+            ram_gb=512.0,
+        ),
+        runtime=Runtime(
+            startup_phase_times_s=StartupPhaseTimes(app_launch=18.4, env_creation=22.9, first_step=4.1),
+            iterations_completed=500,
+            total_wall_time_s=1946.0,
+            steps_per_iteration=24,
+            iteration_time_s=MeanStd(mean=3.82, std=0.04),
+            env_steps_per_s=MeanStd(mean=1_071_780.0, std=11_200.0),
+            iterations_per_s=MeanStd(mean=0.2618, std=0.0028),
+        ),
+        resources=Resources(
+            gpu_util_pct=MeanStd(mean=87.2, std=6.1),
+            gpu_mem_gb=MeanStdPeak(mean=18.4, std=0.3, peak=19.2),
+            cpu_util_pct=MeanStd(mean=31.5, std=4.8),
+            ram_gb=MeanStdPeak(mean=22.1, std=0.4, peak=24.8),
+        ),
+        learning=Learning(
+            ema_alpha=0.05,
+            reward=LearningCurve(final_raw=1823.4, final_ema=1796.1, series_per_iter=[12.3, 34.5, 58.1]),
+            ep_length=LearningCurve(final_raw=987.0, final_ema=962.3, series_per_iter=[4.1, 5.0, 7.2]),
+        ),
+    )
+
+
+def test_training_bundle_round_trip(tmp_path):
+    """A written TrainingBundle reloads as JSON with the expected spot-checked fields."""
+    expected = _minimal_training_bundle()
+    out_file = os.path.join(tmp_path, "training.json")
+    write_bundle_file(expected, out_file)
+
+    with open(out_file) as fh:
+        loaded = json.load(fh)
+
+    assert loaded["schema_version"] == SCHEMA_VERSION
+    assert loaded["run"]["run_id"] == expected.run.run_id
+    assert loaded["runtime"]["env_steps_per_s"]["mean"] == pytest.approx(1_071_780.0)
+    assert loaded["resources"]["ram_gb"]["peak"] == pytest.approx(24.8)
+    assert loaded["learning"]["reward"]["series_per_iter"] == [12.3, 34.5, 58.1]
+    assert loaded["versions"]["skrl"] is None
+
+
+def test_training_bundle_without_series(tmp_path):
+    """A curve built with ``series_per_iter=None`` is serialised as an explicit JSON null."""
+    flat_curve = dict(final_raw=1.0, final_ema=1.0, series_per_iter=None)
+    stripped = dataclasses.replace(
+        _minimal_training_bundle(),
+        learning=Learning(
+            ema_alpha=0.05,
+            reward=LearningCurve(**flat_curve),
+            ep_length=LearningCurve(**flat_curve),
+        ),
+    )
+    out_file = os.path.join(tmp_path, "training.json")
+    write_bundle_file(stripped, out_file)
+    with open(out_file) as fh:
+        learning = json.load(fh)["learning"]
+    for curve_name in ("reward", "ep_length"):
+        assert learning[curve_name]["series_per_iter"] is None
+
+
+def test_startup_bundle_round_trip(tmp_path):
+    """A StartupBundle with a phase mapping and top-function lists survives a JSON round trip."""
+    shared = _minimal_training_bundle()  # reuse its versions / hardware blocks
+    hot_function = CProfileFunction(name="isaaclab.x:y", own_time_s=1.8, cum_time_s=2.4, calls=4312)
+    startup = StartupBundle(
+        run=StartupRunIdentity(
+            run_id="rsl-rl_physx_Isaac-Ant-Direct-v0_20260422-131500_seed42",
+            framework="rsl_rl",
+            backend="physx",
+            task="Isaac-Ant-Direct-v0",
+            seed=42,
+            start_time_utc="2026-04-22T13:15:00Z",
+            end_time_utc="2026-04-22T13:15:48Z",
+            duration_s=48.7,
+            status="completed",
+        ),
+        versions=shared.versions,
+        hardware=shared.hardware,
+        phases={
+            "app_launch": StartupPhase(total_time_s=18.4, top_functions=[hot_function]),
+            "env_creation": StartupPhase(total_time_s=22.9, top_functions=[]),
+            "first_step": StartupPhase(total_time_s=4.1, top_functions=[]),
+        },
+        config=StartupConfig(top_n=30, whitelist="startup_whitelist.yaml"),
+    )
+
+    out_file = os.path.join(tmp_path, "startup.json")
+    write_bundle_file(startup, out_file)
+    with open(out_file) as fh:
+        app_launch = json.load(fh)["phases"]["app_launch"]
+    assert app_launch["total_time_s"] == pytest.approx(18.4)
+    assert app_launch["top_functions"][0]["calls"] == 4312
+
+
+def test_package_reexports_match_schema_module():
+    """Every name listed in ``isaaclab.benchmark.__all__`` must be the very same
+    object as the attribute of that name on ``isaaclab.benchmark.schema``, so the
+    convenience namespace cannot silently diverge from the schema module."""
+    import isaaclab.benchmark as pkg
+    from isaaclab.benchmark import schema
+
+    for exported in pkg.__all__:
+        assert getattr(pkg, exported) is getattr(schema, exported), exported

From d9d07a207b861676a342136383d0aab66746f490 Mon Sep 17 00:00:00 2001
From: Antoine Richard <antoiner@nvidia.com>
Date: Thu, 28 May 2026 17:49:17 +0200
Subject: [PATCH 2/2] Emit v1.0 schema bundles from benchmark scripts

Wire the three standalone benchmark scripts under scripts/benchmarks/
to emit self-contained JSON bundles conforming to the v1.0 schema
added in the previous commit (isaaclab.benchmark.schema):

- benchmark_startup.py now optionally writes a StartupBundle to the
  path given by --schema_v1_output, with per-phase cProfile top-N
  data and total durations.
- benchmark_rsl_rl.py now optionally writes a TrainingBundle with
  the run identity, captured versions/hardware, aggregated runtime
  and resource metrics, and EMA-smoothed reward / episode-length
  curves. The EMA factor is configurable via --ema_alpha; --no_series
  drops the full per-iteration curves and keeps only the scalars.
- benchmark_skrl.py is new: a SKRL-framework counterpart that emits
  the same TrainingBundle with framework set to "skrl". Pairs with a
  small skrl_benchmark_trainer subclass that exposes per-iteration
  reward / episode-length values to the script without touching
  upstream skrl.

The legacy per-backend output format remains the default when
--schema_v1_output is omitted, so existing CI and ad-hoc invocations
keep working unchanged.

Shared helpers (_action_sampling.sample_random_actions to keep
single-agent + multi-agent benchmark startup working, _schema_helpers
to build Versions/Hardware from the recorder metadata and synthesise
a fallback run_id) live alongside the scripts.

utils.parse_cprofile_stats now returns ncalls as a fourth tuple
element so the schema's CProfileFunction.calls field can be populated.

Updated startup_whitelist.yaml to track the IsaacLab v3 configclass /
cloner / scene-init call paths and explicitly fall through to top_n
for python_imports and first_step (per file comments).

Added scripts/benchmarks/tests/ covering the new helpers and CLI
surfaces, plus source/isaaclab/test/benchmark/test_parse_cprofile_stats.py
for the ncalls extension. Added docs/source/features/benchmarking.rst
documenting the scripts and the schema.
---
 docs/index.rst                                |   1 +
 docs/source/features/benchmarking.rst         | 110 ++++
 scripts/benchmarks/_action_sampling.py        |  59 ++
 scripts/benchmarks/_schema_helpers.py         |  83 +++
 scripts/benchmarks/benchmark_rsl_rl.py        | 350 ++++++++++-
 scripts/benchmarks/benchmark_skrl.py          | 562 ++++++++++++++++++
 scripts/benchmarks/benchmark_startup.py       | 140 ++++-
 scripts/benchmarks/skrl_benchmark_trainer.py  | 144 +++++
 scripts/benchmarks/startup_whitelist.yaml     |  47 +-
 scripts/benchmarks/tests/__init__.py          |   4 +
 .../benchmarks/tests/test_action_sampling.py  | 169 ++++++
 .../tests/test_benchmark_rsl_rl_cli.py        | 201 +++++++
 .../tests/test_benchmark_skrl_cli.py          | 234 ++++++++
 .../tests/test_skrl_benchmark_trainer.py      | 232 ++++++++
 scripts/benchmarks/utils.py                   |  26 +-
 .../antoiner-feat-benchmark-scripts-v1.rst    |  26 +
 .../benchmark/test_parse_cprofile_stats.py    |  95 +++
 17 files changed, 2436 insertions(+), 47 deletions(-)
 create mode 100644 docs/source/features/benchmarking.rst
 create mode 100644 scripts/benchmarks/_action_sampling.py
 create mode 100644 scripts/benchmarks/_schema_helpers.py
 create mode 100644 scripts/benchmarks/benchmark_skrl.py
 create mode 100644 scripts/benchmarks/skrl_benchmark_trainer.py
 create mode 100644 scripts/benchmarks/tests/__init__.py
 create mode 100644 scripts/benchmarks/tests/test_action_sampling.py
 create mode 100644 scripts/benchmarks/tests/test_benchmark_rsl_rl_cli.py
 create mode 100644 scripts/benchmarks/tests/test_benchmark_skrl_cli.py
 create mode 100644 scripts/benchmarks/tests/test_skrl_benchmark_trainer.py
 create mode 100644 source/isaaclab/changelog.d/antoiner-feat-benchmark-scripts-v1.rst
 create mode 100644 source/isaaclab/test/benchmark/test_parse_cprofile_stats.py

diff --git a/docs/index.rst b/docs/index.rst
index 51da7b86d9ff..f5c41742a9dd 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -134,6 +134,7 @@ Table of Contents
    Tiled Rendering</source/overview/core-concepts/sensors/camera>
    source/features/ray
    source/features/reproducibility
+   source/features/benchmarking
 
 
 .. toctree::
diff --git a/docs/source/features/benchmarking.rst b/docs/source/features/benchmarking.rst
new file mode 100644
index 000000000000..e914c88c6298
--- /dev/null
+++ b/docs/source/features/benchmarking.rst
@@ -0,0 +1,110 @@
+Benchmarking
+============
+
+Isaac Lab ships three standalone benchmark scripts that emit a common
+``v1.0`` JSON schema for training-performance and startup-performance data.
+The schema is defined in :mod:`isaaclab.benchmark.schema`, and the scripts
+are fully usable standalone — any tool that can read JSON can consume the
+output.
+
+.. contents::
+   :local:
+   :depth: 2
+
+
+Scripts
+-------
+
+``benchmark_startup.py``
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Profiles five IsaacLab startup phases with ``cProfile``: ``app_launch``,
+``python_imports``, ``task_config``, ``env_creation``, and ``first_step``. For
+each phase it records wall-clock time and the top N self-time functions.
+
+.. code-block:: bash
+
+   ./isaaclab.sh -p scripts/benchmarks/benchmark_startup.py \
+       --task Isaac-Ant-Direct-v0 --num_envs 4096 --headless \
+       --schema_v1_output /tmp/startup.json
+
+``benchmark_rsl_rl.py``
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Trains a task with the RSL-RL PPO agent and records runtime / resource /
+learning metrics, including exponentially-smoothed reward and episode-length
+curves.
+
+.. code-block:: bash
+
+   ./isaaclab.sh -p scripts/benchmarks/benchmark_rsl_rl.py \
+       --task Isaac-Ant-Direct-v0 --num_envs 4096 \
+       --max_iterations 500 --headless \
+       --schema_v1_output /tmp/training.json
+
+``benchmark_skrl.py``
+~~~~~~~~~~~~~~~~~~~~~
+
+The SKRL-framework counterpart to ``benchmark_rsl_rl.py``. Emits the same
+schema with ``framework: "skrl"``.
+
+.. code-block:: bash
+
+   ./isaaclab.sh -p scripts/benchmarks/benchmark_skrl.py \
+       --task Isaac-Ant-Direct-v0 --num_envs 4096 \
+       --max_iterations 500 --headless \
+       --schema_v1_output /tmp/training_skrl.json
+
+
+v1.0 schema summary
+-------------------
+
+Each script writes a single self-contained JSON file. The shape is defined by
+dataclasses in :mod:`isaaclab.benchmark.schema` — refer to the module for
+per-field units and descriptions.
+
+:class:`~isaaclab.benchmark.schema.TrainingBundle` (training scripts)
+top-level keys:
+
+* ``run`` — run identity (``run_id``, ``framework``, ``backend``, ``task``,
+  ``seed``, ``num_envs``, ``max_iterations``, timestamps, ``status``).
+* ``versions`` — software versions at run time (Isaac Lab, Isaac Sim, Kit,
+  Newton, Warp, Torch, RSL-RL / SKRL, git metadata).
+* ``hardware`` — host snapshot (hostname, GPU devices, CPU, RAM).
+* ``runtime`` — aggregated timings (``iterations_completed``,
+  ``iteration_time_s``, ``env_steps_per_s``, ``iterations_per_s``,
+  ``startup_phase_times_s``).
+* ``resources`` — aggregated GPU/CPU/RAM utilisation (mean/std/peak).
+* ``learning`` — final-value and EMA-smoothed reward / episode-length curves,
+  with full per-iteration series unless ``--no_series`` is passed.
+
+:class:`~isaaclab.benchmark.schema.StartupBundle` (``benchmark_startup.py``)
+replaces ``runtime`` / ``resources`` / ``learning`` with:
+
+* ``phases`` — mapping from phase name to ``{total_time_s, top_functions}``.
+* ``config`` — CLI configuration (``top_n``, ``whitelist``).
+
+
+Common CLI flags
+----------------
+
+``--schema_v1_output <path>``
+    Write the v1.0 JSON bundle to this path. If omitted, the script falls
+    back to the legacy per-backend output format.
+
+``--backend {physx, newton}``
+    Physics backend tag recorded in the bundle (``physx`` if omitted).
+    ``benchmark_rsl_rl.py`` also maps it to hydra ``presets=<backend>``.
+
+``--run_id <string>``
+    Explicit run-identity string. If omitted, a synthetic run_id of the
+    form ``<framework>_<backend>_<task>_<YYYYMMDD-HHMMSS>_seed<seed>`` is
+    generated.
+
+``--ema_alpha <float>`` (training scripts)
+    EMA smoothing factor for reward / episode-length curves (default
+    ``0.05``, roughly a 20-sample window).
+
+``--no_series`` (training scripts)
+    Omit per-iteration series from the bundle, leaving only the
+    ``final_raw`` + ``final_ema`` scalars.
diff --git a/scripts/benchmarks/_action_sampling.py b/scripts/benchmarks/_action_sampling.py
new file mode 100644
index 000000000000..1305cb915c78
--- /dev/null
+++ b/scripts/benchmarks/_action_sampling.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Random-action sampler shared across the benchmark scripts.
+
+Single-agent (``DirectRLEnv`` / ``ManagerBasedRLEnv``) envs expose
+``single_action_space``; multi-agent (``DirectMARLEnv``) envs expose
+``action_spaces`` — a dict keyed by agent id. ``env.step`` accepts the
+matching shape: a stacked tensor for single-agent, a dict of stacked
+tensors for multi-agent. The benchmark startup phase needs random
+actions for the first env step and previously assumed single-agent;
+this helper picks the right shape.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
+import torch
+
+__all__ = ["sample_random_actions"]
+
+
+def sample_random_actions(env: Any) -> torch.Tensor | dict[str, torch.Tensor]:
+    """Draw a batch of random actions that can be passed directly to ``env.step``.
+
+    The env flavour is detected by duck typing on the unwrapped env: if it
+    has an ``action_spaces`` attribute (plural, a dict keyed by agent id, as
+    on ``DirectMARLEnv``) one batch is produced per agent; otherwise the
+    batch is drawn from ``single_action_space`` (``DirectRLEnv`` /
+    ``ManagerBasedRLEnv``).
+
+    Every batch is built by calling ``space.sample()`` ``num_envs`` times,
+    stacking the samples with NumPy and converting the result to a
+    ``torch.float32`` tensor on the env's device.
+
+    Args:
+        env: The benchmark target — typically a ``gym.Env`` returned by
+            ``gym.make``. Its ``unwrapped`` env must expose ``num_envs``
+            and ``device`` plus either ``single_action_space`` or
+            ``action_spaces``.
+
+    Returns:
+        A ``torch.Tensor`` of shape ``(num_envs, action_dim)`` for
+        single-agent envs, or a dict ``{agent: tensor}`` holding one such
+        batch per agent for multi-agent envs.
+    """
+    base_env = env.unwrapped
+
+    def _batch(space) -> torch.Tensor:
+        stacked = np.stack([space.sample() for _ in range(base_env.num_envs)])
+        return torch.as_tensor(stacked, dtype=torch.float32, device=base_env.device)
+
+    if not hasattr(base_env, "action_spaces"):
+        return _batch(base_env.single_action_space)
+    return {agent_id: _batch(space) for agent_id, space in base_env.action_spaces.items()}
diff --git a/scripts/benchmarks/_schema_helpers.py b/scripts/benchmarks/_schema_helpers.py
new file mode 100644
index 000000000000..c794c07fb200
--- /dev/null
+++ b/scripts/benchmarks/_schema_helpers.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Shared helpers for the v1.0 benchmark bundle writers.
+
+Used by ``benchmark_startup.py``, ``benchmark_rsl_rl.py``, and
+``benchmark_skrl.py`` to build schema-v1 ``Versions`` and ``Hardware``
+dataclasses from the benchmark's manual recorders, and to synthesise a
+fallback run_id when the caller did not provide one.
+"""
+
+from __future__ import annotations
+
+import socket
+from datetime import datetime, timezone
+
+from isaaclab.test.benchmark import BaseIsaacLabBenchmark
+from isaaclab.benchmark.schema import GpuDeviceInfo, Hardware, Versions
+
+
+def capture_versions(bm: BaseIsaacLabBenchmark) -> Versions:
+    """Translate the ``VersionInfo`` recorder metadata into a schema-v1 :class:`Versions`.
+
+    Call before :meth:`BaseIsaacLabBenchmark._finalize_impl`, which clears ``_manual_recorders``.
+    """
+    entries = bm._manual_recorders["VersionInfo"].get_data().metadata
+    lookup = {entry.name: entry.data for entry in entries}.get
+    git = lookup("dev") or {}
+    return Versions(
+        isaaclab=lookup("isaaclab_version", "unknown"),
+        isaacsim=lookup("isaacsim_version"),
+        kit=lookup("kit_version"),
+        newton=lookup("newton_version"),
+        warp=lookup("warp_version"),
+        mjwarp=lookup("mujoco_warp_version"),
+        torch=lookup("torch_version", "unknown"),
+        rsl_rl=lookup("rsl_rl_version"),
+        skrl=lookup("skrl_version"),
+        git_commit=git.get("commit_hash"),
+        git_branch=git.get("branch"),
+        git_dirty=bool(git.get("dirty", False)),
+    )
+
+
+def capture_hardware(bm: BaseIsaacLabBenchmark) -> Hardware:
+    """Assemble a schema-v1 :class:`Hardware` from the GPU / CPU / memory recorder metadata.
+
+    Call before :meth:`BaseIsaacLabBenchmark._finalize_impl`, which
+    clears ``_manual_recorders``.
+    """
+
+    def _metadata(recorder_name: str) -> dict:
+        return {m.name: m.data for m in bm._manual_recorders[recorder_name].get_data().metadata}
+
+    gpu, cpu, mem = _metadata("GPUInfo"), _metadata("CPUInfo"), _metadata("MemoryInfo")
+    gpu_devices = []
+    # The ``or`` fallbacks turn missing, None or zero metadata into neutral defaults.
+    for raw in (gpu.get("gpu_devices") or {}).values():
+        name = str(raw.get("name", "unknown"))
+        mem_gb = float(raw.get("total_memory_gb", 0.0) or 0.0)
+        compute_cap = str(raw.get("compute_capability", "unknown"))
+        gpu_devices.append(GpuDeviceInfo(name=name, mem_gb=mem_gb, compute_cap=compute_cap))
+    return Hardware(
+        hostname=socket.gethostname(),
+        gpu_devices=gpu_devices,
+        cpu_name=str(cpu.get("cpu_name", "unknown")),
+        cpu_count=int(cpu.get("physical_cores", 0) or 0),
+        ram_gb=float(mem.get("total_ram_gb", 0.0) or 0.0),
+    )
+
+
+def synth_run_id(framework: str, backend: str, task: str, seed: int) -> str:
+    """Synthesise a run_id for runs that were started without ``--run_id``.
+
+    The id reads ``<framework>_<backend>_<task>_<YYYYMMDD-HHMMSS>_seed<seed>``:
+    the timestamp is the current UTC time and underscores inside
+    ``framework`` are turned into hyphens (``rsl_rl`` -> ``rsl-rl``).
+    """
+    hyphenated = framework.replace("_", "-")
+    stamp = format(datetime.now(timezone.utc), "%Y%m%d-%H%M%S")
+    return "_".join([hyphenated, f"{backend}", f"{task}", stamp, f"seed{seed}"])
diff --git a/scripts/benchmarks/benchmark_rsl_rl.py b/scripts/benchmarks/benchmark_rsl_rl.py
index 54aaf8dfcc77..9f278b9ef948 100644
--- a/scripts/benchmarks/benchmark_rsl_rl.py
+++ b/scripts/benchmarks/benchmark_rsl_rl.py
@@ -8,7 +8,6 @@
 """Launch Isaac Sim Simulator first."""
 
 import argparse
-import contextlib
 import os
 import sys
 import time
@@ -27,6 +26,40 @@
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../.."))
 import scripts.reinforcement_learning.rsl_rl.cli_args as cli_args  # isort: skip
 
+
+def _native_backend_matches(raw_cfg, requested: str) -> bool:
+    """Tell whether the task's own ``raw_cfg.sim.physics`` already is the ``requested`` backend.
+
+    A missing ``sim`` or a sim-level :class:`PresetCfg` wrapper yields ``False``
+    (presets carry several backends and are resolved by the preset system).
+    """
+    sim_cfg = getattr(raw_cfg, "sim", None)
+    if sim_cfg is None:
+        return False
+    from isaaclab_tasks.utils.hydra import PresetCfg
+
+    if isinstance(sim_cfg, PresetCfg):
+        return False  # preset system handles it; presets_available is the source of truth
+    physics_cfg = getattr(sim_cfg, "physics", None)
+    if physics_cfg is None:
+        # SimulationCfg.physics defaults to None which means PhysxCfg().
+        return requested == "physx"
+    from isaaclab_newton.physics import NewtonCfg
+    from isaaclab_physx.physics import PhysxCfg
+
+    native_types = [(PhysxCfg, "physx"), (NewtonCfg, "newton")]
+    try:  # ovphysx is optional: only consider it when the package imports.
+        from isaaclab_ovphysx.physics import OvPhysxCfg
+    except ImportError:
+        pass
+    else:
+        native_types.append((OvPhysxCfg, "ovphysx"))
+    for cfg_type, backend_name in native_types:
+        if isinstance(physics_cfg, cfg_type):
+            return requested == backend_name
+    return False
+
+
 # add argparse arguments
 parser = argparse.ArgumentParser(description="Train an RL agent with RSL-RL.")
 parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.")
@@ -65,6 +98,52 @@
 parser.add_argument(
     "--convergence_config", type=str, default="full", help="Config mode for convergence thresholds (default: full)."
 )
+parser.add_argument(
+    "--backend",
+    choices=["physx", "newton"],
+    default=None,
+    help=(
+        "Physics backend to run with. Drives both the bundle tag and "
+        "hydra `presets=<backend>`. Pass an explicit `presets=...` on "
+        "the CLI to override."
+    ),
+)
+parser.add_argument(
+    "--log_dir",
+    type=str,
+    default=None,
+    help=(
+        "Absolute path where the training framework writes its outputs "
+        "(TB events, checkpoints, params). When unset, falls back to "
+        "the default logs/<framework>/<experiment>/<timestamp>/ path. "
+        "Useful for downstream tooling that wants to collect outputs "
+        "into a pre-allocated directory."
+    ),
+)
+parser.add_argument(
+    "--run_id",
+    type=str,
+    default=None,
+    help="Run identity string to embed in the bundle. If omitted, a synthetic run_id is generated.",
+)
+parser.add_argument(
+    "--schema_v1_output",
+    type=str,
+    default=None,
+    help="If set, write a schema-v1 training.json to this path.",
+)
+parser.add_argument(
+    "--ema_alpha",
+    type=float,
+    default=0.05,
+    help="EMA smoothing factor for reward/ep_length (default 0.05, ~20-sample window).",
+)
+parser.add_argument(
+    "--no_series",
+    action="store_true",
+    default=False,
+    help="Omit per-iteration series from training.json (leaves final_raw + final_ema only).",
+)
 add_success_cli_args(parser)
 
 # append RSL-RL cli arguments
@@ -73,14 +152,61 @@
 AppLauncher.add_app_launcher_args(parser)
 args_cli, hydra_args = setup_preset_cli(parser)
 hydra_args = fold_preset_tokens(hydra_args)
-sys.argv = [sys.argv[0]] + hydra_args
 if args_cli.video:
     args_cli.enable_cameras = True
 
+# Map --backend X to hydra presets=X so the physics preset is applied
+# at config-resolve time.  Validate first: a task without an X preset
+# runs un-injected if its native sim.physics is X, else exits (code 2)
+# with the stable stderr prefix the Asgard worker classifier matches on.
+# An explicit presets=... on the CLI bypasses validation (operator override).
+if args_cli.backend is not None:
+    existing_presets = [a for a in hydra_args if a.startswith("presets=")]
+    if existing_presets:
+        print(f"[WARNING] --backend={args_cli.backend} ignored because {existing_presets[0]} was explicitly passed.")
+    else:
+        from isaaclab_tasks.utils.parse_cfg import load_cfg_from_registry
+        from isaaclab_tasks.utils.presets import has_physics_preset
+
+        try:
+            _raw_cfg = load_cfg_from_registry(args_cli.task, "env_cfg_entry_point")
+        except Exception as exc:  # noqa: BLE001 — fall through to original behaviour
+            print(
+                f"[WARNING] could not load raw cfg for {args_cli.task!r} "
+                f"to validate preset support ({type(exc).__name__}: {exc}); "
+                f"injecting presets={args_cli.backend} unchecked.",
+                file=sys.stderr,
+            )
+            hydra_args = [f"presets={args_cli.backend}"] + hydra_args
+        else:
+            if has_physics_preset(_raw_cfg, args_cli.backend):
+                hydra_args = [f"presets={args_cli.backend}"] + hydra_args
+            elif _native_backend_matches(_raw_cfg, args_cli.backend):
+                print(
+                    f"[INFO] task {args_cli.task!r} has no '{args_cli.backend}' "
+                    f"preset; running on native {args_cli.backend} backend (no "
+                    f"injection).",
+                    file=sys.stderr,
+                )
+                # No injection — hydra_args unchanged.
+            else:
+                sys.stderr.write(
+                    f"[ERROR] preset_unsupported: task {args_cli.task!r} has no "
+                    f"{args_cli.backend!r} preset. Inspect raw_cfg.sim.physics or "
+                    f"re-enumerate {{physx,newton}}_envs.yaml.\n"
+                )
+                sys.exit(2)
+
+# Hand only the (possibly preset-augmented) hydra arguments on to Hydra via sys.argv.
+sys.argv = [sys.argv[0]] + hydra_args
+
 imports_time_begin = time.perf_counter_ns()
 
+import contextlib
 import importlib.metadata as metadata
-from datetime import datetime
+from datetime import datetime, timezone
+
+_SCRIPT_START_DT = datetime.now(timezone.utc)
 
 import gymnasium as gym
 import numpy as np
@@ -94,10 +220,6 @@
 from isaaclab_rl.rsl_rl import RslRlOnPolicyRunnerCfg, RslRlVecEnvWrapper, handle_deprecated_rsl_rl_cfg
 
 import isaaclab_tasks  # noqa: F401
-
-# PLACEHOLDER: Extension template (do not remove this comment)
-with contextlib.suppress(ImportError):
-    import isaaclab_tasks_experimental  # noqa: F401
 from isaaclab_tasks.utils import get_checkpoint_path, launch_simulation, resolve_task_config
 
 imports_time_end = time.perf_counter_ns()
@@ -105,6 +227,7 @@
 from isaaclab.test.benchmark import BaseIsaacLabBenchmark, BenchmarkMonitor
 from isaaclab.utils.timer import Timer
 
+from scripts.benchmarks._schema_helpers import capture_hardware, capture_versions, synth_run_id
 from scripts.benchmarks.utils import (
     get_backend_type,
     get_preset_string,
@@ -146,6 +269,157 @@
 )
 
 
+def _compute_ema(series: list[float], alpha: float) -> float:
+    """Return the last value of an exponential moving average over ``series``.
+
+    The average is seeded with ``series[0]``; every later sample ``y``
+    updates it as ``ema = alpha * y + (1 - alpha) * ema``. An empty series
+    yields ``0.0``.
+
+    Args:
+        series: Per-iteration scalar values (reward or episode length).
+        alpha: Smoothing factor in [0, 1]. Smaller values give more smoothing.
+
+    Returns:
+        The EMA after the whole series has been consumed.
+    """
+    smoothed = 0.0
+    # The first sample seeds the average; every later one is blended into it.
+    for index, sample in enumerate(series):
+        smoothed = float(sample) if index == 0 else alpha * float(sample) + (1.0 - alpha) * smoothed
+    return smoothed
+
+
+def _find_measurement(measurements, name: str) -> float | None:
+    """Return ``float(value)`` of the first measurement called ``name``, or ``None`` if there is none."""
+    hit = next((entry for entry in measurements if entry.name == name), None)
+    if hit is None:
+        return None
+    return float(hit.value)
+
+
+def _capture_resources(bm: BaseIsaacLabBenchmark):
+    """Summarise the GPU / CPU / memory recorders as a schema-v1 :class:`Resources`."""
+    from isaaclab.benchmark.schema import MeanStd, MeanStdPeak, Resources
+
+    recorders = bm._manual_recorders
+    gpu_rows = recorders["GPUInfo"].get_data().measurements
+    cpu_rows = recorders["CPUInfo"].get_data().measurements
+    mem_rows = recorders["MemoryInfo"].get_data().measurements
+
+    # A measurement that is missing (or whose value is falsy) is reported as 0.0.
+    def _value(rows, label: str) -> float:
+        return _find_measurement(rows, label) or 0.0
+
+    def _mean_std(rows, label: str) -> dict:
+        return {"mean": _value(rows, label), "std": _value(rows, f"{label} std")}
+
+    def _mean_std_peak(rows, label: str) -> dict:
+        return {**_mean_std(rows, label), "peak": _value(rows, f"{label} peak")}
+
+    return Resources(
+        gpu_util_pct=MeanStd(**_mean_std(gpu_rows, "GPU Utilization")),
+        gpu_mem_gb=MeanStdPeak(**_mean_std_peak(gpu_rows, "GPU Memory Used")),
+        cpu_util_pct=MeanStd(**_mean_std(cpu_rows, "CPU Utilization")),
+        ram_gb=MeanStdPeak(**_mean_std_peak(mem_rows, "System Memory RSS")),
+    )
+
+
+def _build_training_bundle(
+    log_data,
+    agent_cfg,
+    env,
+    args,
+    framework: str,
+    versions,
+    hardware,
+    resources,
+    run_start_dt: datetime,
+    run_end_dt: datetime,
+    status: str,
+    app_launch_s: float,
+    env_creation_s: float,
+    first_step_s: float,
+):
+    """Build a schema-v1 :class:`TrainingBundle` from tensorboard-parsed training data (``log_data``)."""
+    import numpy as np
+
+    from isaaclab.benchmark.schema import (
+        Learning,
+        LearningCurve,
+        MeanStd,
+        RunIdentity,
+        Runtime,
+        StartupPhaseTimes,
+        TrainingBundle,
+    )
+
+    reward_series = [float(x) for x in log_data.get("Train/mean_reward", [])]
+    ep_len_series = [float(x) for x in log_data.get("Train/mean_episode_length", [])]
+
+    num_envs = env.unwrapped.num_envs
+    steps_per_iter = agent_cfg.num_steps_per_env
+    total_fps = list(log_data.get("Perf/total_fps", []) or [])  # missing / None -> []
+    iter_times = [num_envs * steps_per_iter / fps if fps > 0 else 0.0 for fps in total_fps]  # 0.0 if fps <= 0
+
+    def _ms(xs):  # mean / std of a series; both 0.0 when the series is empty
+        return MeanStd(
+            mean=float(np.mean(xs)) if xs else 0.0,
+            std=float(np.std(xs)) if xs else 0.0,
+        )
+
+    env_steps_per_s_series = [num_envs * steps_per_iter / t if t > 0 else 0.0 for t in iter_times]
+    iters_per_s_series = [1.0 / t if t > 0 else 0.0 for t in iter_times]
+
+    backend = args.backend or "physx"  # bundle tag falls back to physx when --backend is unset
+    run_id = args.run_id or synth_run_id(framework, backend, args.task, args.seed)
+
+    return TrainingBundle(
+        run=RunIdentity(
+            run_id=run_id,
+            framework=framework,
+            backend=backend,
+            task=args.task,
+            seed=args.seed,
+            num_envs=num_envs,
+            max_iterations=agent_cfg.max_iterations,
+            start_time_utc=run_start_dt.isoformat().replace("+00:00", "Z"),  # "Z" suffix for UTC
+            end_time_utc=run_end_dt.isoformat().replace("+00:00", "Z"),
+            duration_s=(run_end_dt - run_start_dt).total_seconds(),
+            status=status,
+        ),
+        versions=versions,
+        hardware=hardware,
+        runtime=Runtime(
+            startup_phase_times_s=StartupPhaseTimes(
+                app_launch=app_launch_s,
+                env_creation=env_creation_s,
+                first_step=first_step_s,
+            ),
+            iterations_completed=len(iter_times),  # one entry per "Perf/total_fps" sample
+            total_wall_time_s=sum(iter_times),  # sum of the derived per-iteration times only
+            steps_per_iteration=steps_per_iter,
+            iteration_time_s=_ms(iter_times),
+            env_steps_per_s=_ms(env_steps_per_s_series),
+            iterations_per_s=_ms(iters_per_s_series),
+        ),
+        resources=resources,
+        learning=Learning(
+            ema_alpha=args.ema_alpha,
+            reward=LearningCurve(
+                final_raw=reward_series[-1] if reward_series else 0.0,
+                final_ema=_compute_ema(reward_series, args.ema_alpha),
+                series_per_iter=None if args.no_series else reward_series,  # --no_series drops the curve
+            ),
+            ep_length=LearningCurve(
+                final_raw=ep_len_series[-1] if ep_len_series else 0.0,
+                final_ema=_compute_ema(ep_len_series, args.ema_alpha),
+                series_per_iter=None if args.no_series else ep_len_series,
+            ),
+        ),
+    )
+
+
 def main(
     env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg,
     agent_cfg: RslRlOnPolicyRunnerCfg,
@@ -191,15 +465,21 @@ def main(
         agent_cfg.seed = seed
         world_size = int(os.getenv("WORLD_SIZE", 1))
 
-    # specify directory for logging experiments
-    log_root_path = os.path.join("logs", "rsl_rl", agent_cfg.experiment_name)
-    log_root_path = os.path.abspath(log_root_path)
-    print(f"[INFO] Logging experiment in directory: {log_root_path}")
-    # specify directory for logging runs: {time-stamp}_{run_name}
-    log_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-    if agent_cfg.run_name:
-        log_dir += f"_{agent_cfg.run_name}"
-    log_dir = os.path.join(log_root_path, log_dir)
+    if args_cli.log_dir is not None:
+        # Explicit override: write straight into the given dir.
+        log_dir = os.path.abspath(args_cli.log_dir)
+        log_root_path = os.path.dirname(log_dir)
+        os.makedirs(log_dir, exist_ok=True)
+        print(f"[INFO] Logging experiment in directory: {log_dir}")
+    else:
+        # Default: auto-generate logs/<framework>/<experiment>/<timestamp>/
+        log_root_path = os.path.join("logs", "rsl_rl", agent_cfg.experiment_name)
+        log_root_path = os.path.abspath(log_root_path)
+        print(f"[INFO] Logging experiment in directory: {log_root_path}")
+        log_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        if agent_cfg.run_name:
+            log_dir += f"_{agent_cfg.run_name}"
+        log_dir = os.path.join(log_root_path, log_dir)
 
     # max iterations for training
     if args_cli.max_iterations:
@@ -301,8 +581,46 @@ def main(
         tracker = get_success_tracker(args_cli, early_stop_ctx.tracker, log_data)
         log_success(benchmark, tracker, framework_iteration_count=early_stop_ctx.framework_iteration_count)
 
+        # Capture v1 state before _finalize_impl nulls out _manual_recorders.
+        versions_v1 = None
+        hardware_v1 = None
+        resources_v1 = None
+        if args_cli.schema_v1_output is not None:
+            versions_v1 = capture_versions(benchmark)
+            hardware_v1 = capture_hardware(benchmark)
+            resources_v1 = _capture_resources(benchmark)
+
         benchmark._finalize_impl()
 
+        if args_cli.schema_v1_output is not None:
+            from isaaclab.benchmark.schema import write_bundle_file
+
+            # Proxy for first-step time: the first iteration's collection+learning time.
+            # Pending a dedicated first-step timer in runner.learn().
+            first_step_s = 0.0
+            with contextlib.suppress(IndexError, KeyError, ValueError):
+                first_step_s = float(rl_training_times["Collection Time"][0]) + float(
+                    rl_training_times["Learning Time"][0]
+                )
+
+            bundle = _build_training_bundle(
+                log_data=log_data,
+                agent_cfg=agent_cfg,
+                env=env,
+                args=args_cli,
+                framework="rsl_rl",
+                versions=versions_v1,
+                hardware=hardware_v1,
+                resources=resources_v1,
+                run_start_dt=_SCRIPT_START_DT,
+                run_end_dt=datetime.now(timezone.utc),
+                status="completed",
+                app_launch_s=(app_start_time_end - app_start_time_begin) / 1e9,
+                env_creation_s=(task_startup_time_end - task_startup_time_begin) / 1e9,
+                first_step_s=first_step_s,
+            )
+            write_bundle_file(bundle, args_cli.schema_v1_output)
+
     # close the simulator
     env.close()
 
diff --git a/scripts/benchmarks/benchmark_skrl.py b/scripts/benchmarks/benchmark_skrl.py
new file mode 100644
index 000000000000..0a3ad8fc6ac9
--- /dev/null
+++ b/scripts/benchmarks/benchmark_skrl.py
@@ -0,0 +1,562 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Script to benchmark RL agent with SKRL.
+
+Mirrors :mod:`scripts.benchmarks.benchmark_rsl_rl` but trains through SKRL's Runner.
+The v1.0 ``training.json`` output is identical in shape; only the
+``framework`` field switches to ``"skrl"``.
+"""
+
+"""Launch Isaac Sim Simulator first."""
+
+import argparse
+import os
+import sys
+import time
+
+from isaaclab.app import AppLauncher
+
+sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../.."))
+
+
+def _native_backend_matches(raw_cfg, requested: str) -> bool:
+    """Tell whether the task's own ``sim.physics`` config is the requested backend.
+
+    A sim-level :class:`PresetCfg` wrapper always gives ``False``: it bundles
+    several backends, so choosing one is left to the preset system downstream.
+    """
+    sim_cfg = getattr(raw_cfg, "sim", None)
+    if sim_cfg is None:
+        return False
+    from isaaclab_tasks.utils.hydra import PresetCfg
+
+    # Preset wrappers are never "native"; presets_available is the source of truth.
+    if isinstance(sim_cfg, PresetCfg):
+        return False
+    physics_cfg = getattr(sim_cfg, "physics", None)
+    if physics_cfg is None:
+        # An unset SimulationCfg.physics means PhysxCfg().
+        return requested == "physx"
+    from isaaclab_newton.physics import NewtonCfg
+    from isaaclab_physx.physics import PhysxCfg
+
+    native_types = [(PhysxCfg, "physx"), (NewtonCfg, "newton")]
+    try:
+        from isaaclab_ovphysx.physics import OvPhysxCfg
+    except ImportError:
+        pass  # optional package: without it no config can be an OvPhysxCfg
+    else:
+        native_types.append((OvPhysxCfg, "ovphysx"))
+    # First matching type decides; the probe order is physx, newton, ovphysx.
+    matched = next((tag for cfg_type, tag in native_types if isinstance(physics_cfg, cfg_type)), None)
+    return matched is not None and requested == matched
+
+
+# -- CLI arguments -----------------------------------------------------------
+
+parser = argparse.ArgumentParser(description="Benchmark an RL agent with SKRL.")
+parser.add_argument("--num_envs", type=int, default=4096, help="Number of environments to simulate.")
+parser.add_argument("--task", type=str, default=None, help="Name of the task.")
+parser.add_argument("--seed", type=int, default=42, help="Seed used for the environment")
+parser.add_argument("--max_iterations", type=int, default=10, help="RL policy training iterations.")
+parser.add_argument(
+    "--algorithm",
+    type=str,
+    default="PPO",  # lower-cased later; "ppo" may be promoted to "ippo" for multi-agent tasks
+    choices=["AMP", "PPO", "IPPO", "MAPPO"],
+    help="The RL algorithm used for training the SKRL agent.",
+)
+parser.add_argument(
+    "--ml_framework",
+    type=str,
+    default="torch",
+    choices=["torch", "jax", "jax-numpy"],
+    help="The ML framework used for training the SKRL agent.",
+)
+parser.add_argument(
+    "--benchmark_backend",
+    type=str,
+    default="omniperf",
+    choices=[
+        "json",
+        "osmo",
+        "omniperf",
+        "summary",
+        "LocalLogMetrics",
+        "JSONFileMetrics",
+        "OsmoKPIFile",
+        "OmniPerfKPIFile",
+    ],
+    help="Benchmarking backend options, defaults omniperf",
+)
+parser.add_argument("--output_path", type=str, default=".", help="Path to output benchmark results.")
+parser.add_argument(
+    "--backend",
+    choices=["physx", "newton"],
+    default=None,  # None: no presets=... is injected and the bundle is tagged "physx"
+    help=(
+        "Physics backend to run with. Drives both the bundle tag and "
+        "hydra `presets=<backend>`. Pass an explicit `presets=...` on "
+        "the CLI to override."
+    ),
+)
+parser.add_argument(
+    "--log_dir",
+    type=str,
+    default=None,
+    help=(
+        "Absolute path where the training framework writes its outputs "
+        "(TB events, checkpoints, params). When unset, falls back to "
+        "the default logs/<framework>/<experiment>/<timestamp>/ path. "
+        "Useful for downstream tooling that wants to collect outputs "
+        "into a pre-allocated directory."
+    ),
+)
+parser.add_argument(
+    "--run_id",
+    type=str,
+    default=None,
+    help="Run identity string to embed in the bundle. If omitted, a synthetic run_id is generated.",
+)
+parser.add_argument(
+    "--schema_v1_output",
+    type=str,
+    default=None,
+    help="If set, write a schema-v1 training.json to this path.",
+)
+parser.add_argument(
+    "--ema_alpha",
+    type=float,
+    default=0.05,
+    help="EMA smoothing factor for reward/ep_length (default 0.05, ~20-sample window).",
+)
+parser.add_argument(
+    "--no_series",
+    action="store_true",
+    default=False,
+    help="Omit per-iteration series from training.json (leaves final_raw + final_ema only).",
+)
+# AppLauncher registers its own CLI flags on the same parser.
+AppLauncher.add_app_launcher_args(parser)
+args_cli, hydra_args = parser.parse_known_args()  # unrecognised tokens are kept as Hydra overrides
+
+# Map --backend X to hydra presets=X so the physics preset is applied at
+# config-resolve time. Validate first: a task with an X preset gets the
+# injection, a task already native to X runs as-is, and anything else exits
+# with status 2 and the stable "[ERROR] preset_unsupported:" stderr prefix the
+# Asgard worker classifier matches on. An explicit presets=... skips validation.
+if args_cli.backend is not None:
+    existing_presets = [a for a in hydra_args if a.startswith("presets=")]
+    if existing_presets:
+        print(f"[WARNING] --backend={args_cli.backend} ignored because {existing_presets[0]} was explicitly passed.")
+    else:
+        from isaaclab_tasks.utils.parse_cfg import load_cfg_from_registry
+        from isaaclab_tasks.utils.presets import has_physics_preset
+
+        try:
+            _raw_cfg = load_cfg_from_registry(args_cli.task, "env_cfg_entry_point")
+        except Exception as exc:  # noqa: BLE001 — fall through to original behaviour
+            print(
+                f"[WARNING] could not load raw cfg for {args_cli.task!r} "
+                f"to validate preset support ({type(exc).__name__}: {exc}); "
+                f"injecting presets={args_cli.backend} unchecked.",
+                file=sys.stderr,
+            )
+            hydra_args = [f"presets={args_cli.backend}"] + hydra_args
+        else:
+            if has_physics_preset(_raw_cfg, args_cli.backend):
+                hydra_args = [f"presets={args_cli.backend}"] + hydra_args
+            elif _native_backend_matches(_raw_cfg, args_cli.backend):
+                print(
+                    f"[INFO] task {args_cli.task!r} has no '{args_cli.backend}' "
+                    f"preset; running on native {args_cli.backend} backend (no "
+                    f"injection).",
+                    file=sys.stderr,
+                )
+                # No injection — hydra_args unchanged.
+            else:
+                sys.stderr.write(
+                    f"[ERROR] preset_unsupported: task {args_cli.task!r} has no "
+                    f"{args_cli.backend!r} preset. Inspect raw_cfg.sim.physics or "
+                    f"re-enumerate {{physx,newton}}_envs.yaml.\n"
+                )
+                sys.exit(2)
+
+# Leave Hydra only the program name plus the (possibly augmented) override tokens.
+sys.argv = [sys.argv[0]] + hydra_args
+
+imports_time_begin = time.perf_counter_ns()
+
+from datetime import datetime, timezone
+
+_SCRIPT_START_DT = datetime.now(timezone.utc)
+
+import gymnasium as gym
+import numpy as np
+import torch
+
+from isaaclab.envs import DirectMARLEnvCfg, DirectRLEnvCfg, ManagerBasedRLEnvCfg
+from isaaclab.utils.io import dump_yaml
+
+from isaaclab_rl.skrl import SkrlVecEnvWrapper
+
+import isaaclab_tasks  # noqa: F401
+from isaaclab_tasks.utils import launch_simulation, resolve_task_config
+
+imports_time_end = time.perf_counter_ns()
+
+from isaaclab.test.benchmark import BaseIsaacLabBenchmark, BenchmarkMonitor
+from isaaclab.utils.timer import Timer
+
+from scripts.benchmarks._schema_helpers import capture_hardware, capture_versions, synth_run_id
+from scripts.benchmarks.utils import (
+    get_backend_type,
+    get_preset_string,
+    log_app_start_time,
+    log_python_imports_time,
+    log_rl_policy_episode_lengths,
+    log_rl_policy_rewards,
+    log_runtime_step_times,
+    log_scene_creation_time,
+    log_simulation_start_time,
+    log_task_start_time,
+    log_total_start_time,
+)
+
+torch.backends.cuda.matmul.allow_tf32 = True  # permit TF32 in CUDA matmuls
+torch.backends.cudnn.allow_tf32 = True  # permit TF32 in cuDNN kernels
+torch.backends.cudnn.deterministic = False
+torch.backends.cudnn.benchmark = False
+
+# Resolve SKRL agent entry point (matches scripts/reinforcement_learning/skrl/train.py).
+# For multi-agent (DirectMARLEnv) tasks, plain PPO can't be used — its observation
+# preprocessor and policy assume a single Tensor obs, but DirectMARLEnv emits a
+# per-agent dict. Promote ``ppo`` to ``ippo`` for such tasks; this also applies to
+# an explicit ``--algorithm PPO``, which is indistinguishable from the default.
+# Any other ``--algorithm`` value (e.g. ``MAPPO``) is honoured as-is.
+_algorithm = args_cli.algorithm.lower()
+if _algorithm == "ppo":
+    try:
+        from isaaclab.envs import DirectMARLEnvCfg as _DirectMARLEnvCfg
+
+        from isaaclab_tasks.utils.parse_cfg import load_cfg_from_registry as _peek_cfg
+
+        _peek = _peek_cfg(args_cli.task, "env_cfg_entry_point")
+        if isinstance(_peek, type) and issubclass(_peek, _DirectMARLEnvCfg):  # cfg entry is a class
+            _is_marl = True
+        else:
+            _is_marl = isinstance(_peek, _DirectMARLEnvCfg)  # cfg entry is an instance (or unrelated)
+    except Exception:  # noqa: BLE001 — best-effort detection; fall through to PPO if peek fails
+        _is_marl = False
+    if _is_marl:
+        print(f"[INFO] {args_cli.task!r} is a multi-agent task; promoting --algorithm ppo -> ippo.", file=sys.stderr)
+        _algorithm = "ippo"  # NOTE(review): workflow_metadata below still records args_cli.algorithm
+_agent_cfg_entry_point = "skrl_cfg_entry_point" if _algorithm == "ppo" else f"skrl_{_algorithm}_cfg_entry_point"
+
+backend_type = get_backend_type(args_cli.benchmark_backend)
+benchmark = BaseIsaacLabBenchmark(
+    benchmark_name="benchmark_skrl_train",
+    backend_type=backend_type,
+    output_path=args_cli.output_path,
+    use_recorders=True,
+    frametime_recorders=backend_type in ("summary", "omniperf"),
+    output_prefix=f"benchmark_skrl_train_{args_cli.task}",
+    workflow_metadata={
+        "metadata": [
+            {"name": "task", "data": args_cli.task},
+            {"name": "seed", "data": args_cli.seed},
+            {"name": "num_envs", "data": args_cli.num_envs},
+            {"name": "max_iterations", "data": args_cli.max_iterations},
+            {"name": "algorithm", "data": args_cli.algorithm},  # CLI value as typed, not the promoted _algorithm
+            {"name": "presets", "data": get_preset_string(hydra_args)},
+        ]
+    },
+)
+
+
+def _compute_ema(series: list[float], alpha: float) -> float:
+    """Fold a per-iteration series into its final exponential moving average.
+
+    Args:
+        series: Per-iteration scalar values, oldest first.
+        alpha: Weight of each new sample, in [0, 1]; smaller means smoother.
+
+    Returns:
+        The EMA after the last sample, seeded with the first; 0.0 if empty.
+    """
+    if not series:
+        return 0.0
+    smoothed, decay = float(series[0]), 1.0 - alpha
+    for sample in series[1:]:
+        smoothed = alpha * float(sample) + decay * smoothed
+    return smoothed
+
+
+def _find_measurement(measurements, name: str) -> float | None:
+    """Return the value of the first entry named ``name``, or ``None`` if absent."""
+    # Lazy scan: stops at the first match, exactly like an early-return loop.
+    matches = (entry for entry in measurements if entry.name == name)
+    first = next(matches, None)
+    return None if first is None else float(first.value)
+
+
+def _capture_resources(bm: BaseIsaacLabBenchmark):
+    """Summarise the GPU/CPU/Memory recorders as a schema-v1 :class:`Resources` (missing rows read 0.0)."""
+    from isaaclab.benchmark.schema import MeanStd, MeanStdPeak, Resources
+
+    gpu_rows = bm._manual_recorders["GPUInfo"].get_data().measurements
+    cpu_rows = bm._manual_recorders["CPUInfo"].get_data().measurements
+    mem_rows = bm._manual_recorders["MemoryInfo"].get_data().measurements
+
+    def _stat(rows, row_name: str) -> float:
+        return _find_measurement(rows, row_name) or 0.0
+
+    return Resources(
+        gpu_util_pct=MeanStd(mean=_stat(gpu_rows, "GPU Utilization"), std=_stat(gpu_rows, "GPU Utilization std")),
+        gpu_mem_gb=MeanStdPeak(
+            mean=_stat(gpu_rows, "GPU Memory Used"),
+            std=_stat(gpu_rows, "GPU Memory Used std"),
+            peak=_stat(gpu_rows, "GPU Memory Used peak"),
+        ),
+        cpu_util_pct=MeanStd(mean=_stat(cpu_rows, "CPU Utilization"), std=_stat(cpu_rows, "CPU Utilization std")),
+        ram_gb=MeanStdPeak(
+            mean=_stat(mem_rows, "System Memory RSS"),
+            std=_stat(mem_rows, "System Memory RSS std"),
+            peak=_stat(mem_rows, "System Memory RSS peak"),
+        ),
+    )
+
+
+def _build_training_bundle(
+    reward_series: list[float],
+    ep_len_series: list[float],
+    iter_times_s: list[float],
+    num_envs: int,
+    steps_per_iter: int,
+    args,
+    versions,
+    hardware,
+    resources,
+    run_start_dt: datetime,
+    run_end_dt: datetime,
+    status: str,
+    app_launch_s: float,
+    env_creation_s: float,
+    first_step_s: float,
+):
+    """Assemble the schema-v1 :class:`TrainingBundle` describing one SKRL run.
+
+    Throughput statistics are derived from ``iter_times_s``: every iteration
+    contributes ``num_envs * steps_per_iter / t`` env steps/s and ``1 / t``
+    iterations/s, with 0.0 substituted when ``t`` is not positive. An unset
+    ``args.backend`` is tagged ``"physx"`` and an unset ``args.run_id`` is
+    replaced by :func:`synth_run_id`. ``args.no_series`` drops both learning
+    series while keeping their final raw and EMA values.
+    """
+    import isaaclab.benchmark.schema as schema
+
+    def _mean_std(values):
+        # An empty sample reports 0.0 for both statistics.
+        if not values:
+            return schema.MeanStd(mean=0.0, std=0.0)
+        return schema.MeanStd(mean=float(np.mean(values)), std=float(np.std(values)))
+
+    throughput: list[float] = []
+    iter_rate: list[float] = []
+    for dt in iter_times_s:
+        positive = dt > 0
+        throughput.append(num_envs * steps_per_iter / dt if positive else 0.0)
+        iter_rate.append(1.0 / dt if positive else 0.0)
+
+    backend_tag = args.backend or "physx"
+    keep_series = not args.no_series
+
+    run = schema.RunIdentity(
+        run_id=args.run_id or synth_run_id("skrl", backend_tag, args.task, args.seed),
+        framework="skrl",
+        backend=backend_tag,
+        task=args.task,
+        seed=args.seed,
+        num_envs=num_envs,
+        max_iterations=args.max_iterations,
+        start_time_utc=run_start_dt.isoformat().replace("+00:00", "Z"),
+        end_time_utc=run_end_dt.isoformat().replace("+00:00", "Z"),
+        duration_s=(run_end_dt - run_start_dt).total_seconds(),
+        status=status,
+    )
+    runtime = schema.Runtime(
+        startup_phase_times_s=schema.StartupPhaseTimes(
+            app_launch=app_launch_s, env_creation=env_creation_s, first_step=first_step_s
+        ),
+        iterations_completed=len(iter_times_s),
+        total_wall_time_s=sum(iter_times_s),
+        steps_per_iteration=steps_per_iter,
+        iteration_time_s=_mean_std(iter_times_s),
+        env_steps_per_s=_mean_std(throughput),
+        iterations_per_s=_mean_std(iter_rate),
+    )
+
+    curves = {}
+    for key, series in (("reward", reward_series), ("ep_length", ep_len_series)):
+        curves[key] = schema.LearningCurve(
+            final_raw=series[-1] if series else 0.0,
+            final_ema=_compute_ema(series, args.ema_alpha),
+            series_per_iter=series if keep_series else None,
+        )
+    return schema.TrainingBundle(
+        run=run,
+        versions=versions,
+        hardware=hardware,
+        runtime=runtime,
+        resources=resources,
+        learning=schema.Learning(ema_alpha=args.ema_alpha, **curves),
+    )
+
+
+def main(
+    env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg,
+    agent_cfg: dict,
+    app_start_time_begin: int,
+    app_start_time_end: int,
+):
+    """Train an SKRL agent, log benchmark metrics and optionally write a v1 bundle.
+
+    Args:
+        env_cfg: Resolved environment configuration; CLI overrides are applied in place.
+        agent_cfg: Resolved SKRL agent configuration dictionary; mutated in place.
+        app_start_time_begin: ``time.perf_counter_ns()`` reading before the app launch.
+        app_start_time_end: ``time.perf_counter_ns()`` reading after the app launch.
+    """
+    from skrl.utils.runner.torch import Runner  # NOTE(review): torch Runner even for --ml_framework jax — confirm
+
+    # Override configuration with non-hydra CLI arguments.
+    env_cfg.scene.num_envs = args_cli.num_envs if args_cli.num_envs is not None else env_cfg.scene.num_envs
+    env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device
+    agent_cfg["trainer"]["close_environment_at_exit"] = False
+    # Derive total timesteps from max_iterations (same formula as train.py).
+    rollouts = int(agent_cfg["agent"]["rollouts"])
+    agent_cfg["trainer"]["timesteps"] = args_cli.max_iterations * rollouts
+    agent_cfg["seed"] = args_cli.seed
+    env_cfg.seed = args_cli.seed
+
+    if args_cli.log_dir is not None:
+        # Split <log_dir> into dirname/basename so that SKRL's
+        # ``os.path.join(directory, experiment_name)`` recomposes it exactly; a
+        # falsy ``experiment_name`` would make BaseAgent add a timestamp+classname subdir.
+        log_dir = os.path.abspath(args_cli.log_dir)
+        agent_cfg["agent"]["experiment"]["directory"] = os.path.dirname(log_dir) or "."
+        agent_cfg["agent"]["experiment"]["experiment_name"] = os.path.basename(log_dir)
+        os.makedirs(log_dir, exist_ok=True)
+    else:
+        log_root_path = os.path.join("logs", "skrl", agent_cfg["agent"]["experiment"]["directory"])
+        log_root_path = os.path.abspath(log_root_path)
+        log_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + f"_{_algorithm}_{args_cli.ml_framework}"
+        if agent_cfg["agent"]["experiment"]["experiment_name"]:
+            log_dir += f"_{agent_cfg['agent']['experiment']['experiment_name']}"
+        agent_cfg["agent"]["experiment"]["directory"] = log_root_path
+        agent_cfg["agent"]["experiment"]["experiment_name"] = log_dir
+        log_dir = os.path.join(log_root_path, log_dir)
+    if isinstance(env_cfg, ManagerBasedRLEnvCfg):
+        env_cfg.log_dir = log_dir
+
+    dump_yaml(os.path.join(log_dir, "params", "env.yaml"), env_cfg)
+    dump_yaml(os.path.join(log_dir, "params", "agent.yaml"), agent_cfg)
+
+    task_startup_time_begin = time.perf_counter_ns()
+    env = gym.make(args_cli.task, cfg=env_cfg)
+    env = SkrlVecEnvWrapper(env, ml_framework=args_cli.ml_framework)
+    task_startup_time_end = time.perf_counter_ns()
+
+    from scripts.benchmarks.skrl_benchmark_trainer import BenchmarkTrainer
+
+    class _BenchmarkRunner(Runner):
+        """Runner variant that builds a BenchmarkTrainer instead of a stock SequentialTrainer.
+
+        Using a Runner subclass (rather than swapping ``Runner._trainer`` after
+        construction) ensures SKRL's ``agent.init()`` — which creates a
+        ``SummaryWriter`` — fires exactly once. Swapping after-the-fact would
+        call ``agent.init()`` twice and leave an orphaned TB events file in
+        the log dir.
+        """
+
+        def _generate_trainer(self, env, cfg, agent):
+            # Mirror stock Runner._generate_trainer: pop 'class', pass cfg["trainer"].
+            cfg["trainer"].pop("class", None)
+            return BenchmarkTrainer(env=env, agents=agent, cfg=cfg["trainer"])
+
+    runner = _BenchmarkRunner(env, agent_cfg)
+    benchmark_trainer = runner._trainer
+
+    with BenchmarkMonitor(benchmark, interval=1.0):
+        runner.run()
+
+    # Final recorder update after training completes.
+    benchmark.update_manual_recorders()
+
+    iter_times_s = benchmark_trainer.iter_times_s
+    reward_series = benchmark_trainer.iter_rewards
+    ep_len_series = benchmark_trainer.iter_ep_lengths
+    first_iter_s = float(iter_times_s[0]) if iter_times_s else 0.0  # first-step proxy: first iteration, as in rsl_rl
+    rl_training_times = {
+        "Collection Time": iter_times_s,
+        "Learning Time": [0.0] * len(iter_times_s),
+        "Total FPS": [(args_cli.num_envs * rollouts / t) if t > 0 else 0.0 for t in iter_times_s],
+    }
+
+    log_app_start_time(benchmark, (app_start_time_end - app_start_time_begin) / 1e6)
+    log_python_imports_time(benchmark, (imports_time_end - imports_time_begin) / 1e6)
+    log_task_start_time(benchmark, (task_startup_time_end - task_startup_time_begin) / 1e6)
+    log_scene_creation_time(benchmark, Timer.get_timer_info("scene_creation") * 1000)
+    log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000)
+    log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
+    if iter_times_s:
+        log_runtime_step_times(benchmark, rl_training_times, compute_stats=True)
+    if reward_series:
+        log_rl_policy_rewards(benchmark, reward_series)
+    if ep_len_series:
+        log_rl_policy_episode_lengths(benchmark, ep_len_series)
+
+    # Capture v1 state before _finalize_impl clears the recorders.
+    versions_v1 = hardware_v1 = resources_v1 = None
+    if args_cli.schema_v1_output is not None:
+        versions_v1 = capture_versions(benchmark)
+        hardware_v1 = capture_hardware(benchmark)
+        resources_v1 = _capture_resources(benchmark)
+
+    benchmark._finalize_impl()
+
+    if args_cli.schema_v1_output is not None:
+        from isaaclab.benchmark.schema import write_bundle_file
+
+        bundle = _build_training_bundle(
+            reward_series=reward_series,
+            ep_len_series=ep_len_series,
+            iter_times_s=iter_times_s,
+            num_envs=env.unwrapped.num_envs,
+            steps_per_iter=rollouts,
+            args=args_cli,
+            versions=versions_v1,
+            hardware=hardware_v1,
+            resources=resources_v1,
+            run_start_dt=_SCRIPT_START_DT,
+            run_end_dt=datetime.now(timezone.utc),
+            status="completed",
+            app_launch_s=(app_start_time_end - app_start_time_begin) / 1e9,
+            env_creation_s=(task_startup_time_end - task_startup_time_begin) / 1e9,
+            first_step_s=first_iter_s,
+        )
+        write_bundle_file(bundle, args_cli.schema_v1_output)
+
+    env.close()
+
+
+if __name__ == "__main__":
+    env_cfg, agent_cfg = resolve_task_config(args_cli.task, _agent_cfg_entry_point)
+    # The app-launch timer brackets only the entry into launch_simulation.
+    app_start_time_begin = time.perf_counter_ns()
+    with launch_simulation(env_cfg, args_cli):
+        app_start_time_end = time.perf_counter_ns()
+        main(env_cfg, agent_cfg, app_start_time_begin, app_start_time_end)
diff --git a/scripts/benchmarks/benchmark_startup.py b/scripts/benchmarks/benchmark_startup.py
index 93d92257ca11..b0742fcff6e5 100644
--- a/scripts/benchmarks/benchmark_startup.py
+++ b/scripts/benchmarks/benchmark_startup.py
@@ -16,11 +16,16 @@
 import os
 import sys
 import time
+from datetime import datetime, timezone
 
 from isaaclab.app import AppLauncher
 
 from isaaclab_tasks.utils import fold_preset_tokens, setup_preset_cli
 
+# Wall-clock start of the entire script, captured as early as possible so the
+# startup bundle can report a total duration that covers all phases.
+_SCRIPT_START_DT = datetime.now(timezone.utc)
+
 # -- CLI arguments -----------------------------------------------------------
 
 parser = argparse.ArgumentParser(description="Profile IsaacLab startup phases.")
@@ -56,6 +61,24 @@
     default=None,
     help="Path to YAML file with per-phase function whitelist patterns. Overrides --top_n for listed phases.",
 )
+parser.add_argument(
+    "--schema_v1_output",
+    type=str,
+    default=None,
+    help="If set, write a schema-v1 startup.json to this path.",
+)
+parser.add_argument(
+    "--backend",
+    choices=["physx", "newton"],
+    default=None,
+    help="Physics backend tag recorded in the bundle. Defaults to 'physx' if omitted.",
+)
+parser.add_argument(
+    "--run_id",
+    type=str,
+    default=None,
+    help="Run identity string to embed in the bundle. If omitted, a synthetic run_id is generated.",
+)
 
 # append AppLauncher cli args (provides --device, --headless, etc.)
 AppLauncher.add_app_launcher_args(parser)
@@ -68,6 +91,7 @@
 from isaaclab.test.benchmark import BaseIsaacLabBenchmark, SingleMeasurement
 from isaaclab.utils.timer import Timer, TimerError
 
+from scripts.benchmarks._schema_helpers import capture_hardware, capture_versions, synth_run_id
 from scripts.benchmarks.utils import (
     get_backend_type,
     get_preset_string,
@@ -81,13 +105,14 @@
 imports_profile.enable()
 
 import gymnasium as gym  # noqa: E402
-import numpy as np  # noqa: E402
 import torch  # noqa: E402
 
 from isaaclab.envs import DirectMARLEnvCfg, DirectRLEnvCfg, ManagerBasedRLEnvCfg  # noqa: E402
 
 from isaaclab_tasks.utils import launch_simulation, resolve_task_config  # noqa: E402
 
+from scripts.benchmarks._action_sampling import sample_random_actions  # noqa: E402
+
 imports_profile.disable()
 
 if torch.cuda.is_available() and torch.cuda.is_initialized():
@@ -185,6 +210,86 @@
 )
 
 
+# -- Schema v1 helpers ------------------------------------------------------
+
+
+def _build_startup_bundle(
+    phases_data: dict,
+    run_start_dt: datetime,
+    run_end_dt: datetime,
+    status: str,
+    versions,
+    hardware,
+):
+    """Turn the collected per-phase profiling data into a schema-v1 StartupBundle.
+
+    Args:
+        phases_data: The ``phases`` dict that ``main()`` assembles for legacy logging.
+        run_start_dt: UTC time at which the script as a whole began.
+        run_end_dt: UTC time at which the script as a whole ended.
+        status: How the run ended (``"completed"`` or ``"crashed"``).
+        versions: :class:`Versions` captured ahead of time; capture must happen
+            before ``benchmark._finalize_impl()`` clears the recorders.
+        hardware: :class:`Hardware` captured ahead of time.
+
+    Returns:
+        The :class:`StartupBundle`, ready to hand to :func:`write_bundle_file`.
+    """
+    from isaaclab.benchmark.schema import (
+        CProfileFunction,
+        StartupBundle,
+        StartupConfig,
+        StartupPhase,
+        StartupRunIdentity,
+    )
+
+    # Startup profiling is framework-agnostic; callers that wrap multiple
+    # framework runs pass the real framework via --run_id. "rsl_rl" is only a
+    # placeholder for standalone invocations, because the field is required.
+    framework_tag = "rsl_rl"
+    backend_tag = args_cli.backend or "physx"
+
+    def _phase(phase_name: str, phase: dict) -> StartupPhase:
+        rows = parse_cprofile_stats(
+            phase["profile"], _ISAACLAB_PREFIXES, top_n=args_cli.top_n, whitelist=_WHITELIST.get(phase_name)
+        )
+        # Profile rows carry milliseconds; the bundle stores seconds.
+        functions = [
+            CProfileFunction(
+                name=label,
+                own_time_s=own_ms / 1000.0,
+                cum_time_s=cum_ms / 1000.0,
+                calls=calls,
+            )
+            for label, own_ms, cum_ms, calls in rows
+        ]
+        return StartupPhase(total_time_s=phase["wall_clock_ms"] / 1000.0, top_functions=functions)
+
+    phases_out: dict[str, StartupPhase] = {
+        phase_name: _phase(phase_name, phase) for phase_name, phase in phases_data.items()
+    }
+
+    seed_value = 0 if args_cli.seed is None else args_cli.seed
+    identity = StartupRunIdentity(
+        run_id=args_cli.run_id or synth_run_id(framework_tag, backend_tag, args_cli.task, seed_value),
+        framework=framework_tag,
+        backend=backend_tag,
+        task=args_cli.task,
+        seed=seed_value,
+        start_time_utc=run_start_dt.isoformat().replace("+00:00", "Z"),
+        end_time_utc=run_end_dt.isoformat().replace("+00:00", "Z"),
+        duration_s=(run_end_dt - run_start_dt).total_seconds(),
+        status=status,
+    )
+    return StartupBundle(
+        run=identity,
+        versions=versions,
+        hardware=hardware,
+        phases=phases_out,
+        config=StartupConfig(top_n=args_cli.top_n, whitelist=args_cli.whitelist_config),
+    )
+
+
 # -- Main profiling logic ---------------------------------------------------
 
 
@@ -224,10 +329,10 @@ def main(
         env_creation_time_end = time.perf_counter_ns()
         # -- First step profiled ------------------------------------------------
 
-        # Sample random actions from the action space directly to support
-        # Box, Discrete, MultiDiscrete, and Dict spaces.
-        np_actions = np.stack([env.unwrapped.single_action_space.sample() for _ in range(env.unwrapped.num_envs)])
-        actions = torch.as_tensor(np_actions, dtype=torch.float32, device=env.unwrapped.device)
+        # Sample random actions from the action space(s). Returns a tensor for
+        # single-agent envs and a per-agent dict for multi-agent (DirectMARLEnv)
+        # envs — env.step accepts the matching shape.
+        actions = sample_random_actions(env)
 
         first_step_profile = cProfile.Profile()
         first_step_time_begin = time.perf_counter_ns()
@@ -317,7 +422,7 @@ def main(
                 )
 
             # Log per-function measurements (tottime + cumtime)
-            for label, tottime_ms, cumtime_ms in functions:
+            for label, tottime_ms, cumtime_ms, _ncalls in functions:
                 benchmark.add_measurement(
                     phase_name, measurement=SingleMeasurement(name=label, value=round(tottime_ms, 2), unit="ms")
                 )
@@ -326,9 +431,30 @@ def main(
                     measurement=SingleMeasurement(name=f"{label} (cumtime)", value=round(cumtime_ms, 2), unit="ms"),
                 )
 
-        # Finalize benchmark output
+        # Capture versions/hardware BEFORE finalize, which clears the recorders.
+        versions_v1 = None
+        hardware_v1 = None
+        if args_cli.schema_v1_output is not None:
+            benchmark.update_manual_recorders()
+            versions_v1 = capture_versions(benchmark)
+            hardware_v1 = capture_hardware(benchmark)
+
+        # Finalize benchmark output (nulls out _manual_recorders).
         benchmark.update_manual_recorders()
         benchmark._finalize_impl()
+
+        if args_cli.schema_v1_output is not None:
+            from isaaclab.benchmark.schema import write_bundle_file
+
+            bundle = _build_startup_bundle(
+                phases,
+                _SCRIPT_START_DT,
+                datetime.now(timezone.utc),
+                status="completed",
+                versions=versions_v1,
+                hardware=hardware_v1,
+            )
+            write_bundle_file(bundle, args_cli.schema_v1_output)
     finally:
         if env is not None:
             env.close()
diff --git a/scripts/benchmarks/skrl_benchmark_trainer.py b/scripts/benchmarks/skrl_benchmark_trainer.py
new file mode 100644
index 000000000000..73d58b8ed92b
--- /dev/null
+++ b/scripts/benchmarks/skrl_benchmark_trainer.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""BenchmarkTrainer — SKRL trainer subclass that captures per-iteration metrics.
+
+Mirrors :class:`skrl.trainers.torch.SequentialTrainer`'s training loop and
+records, once per rollout-buffer fill (= one iteration):
+
+* ``iter_times_s``  - wall-clock seconds from the first env step of the
+  rollout to just after ``agent.post_interaction`` of the rollout's final
+  step (i.e. after the PPO update).
+* ``iter_rewards``  - mean reward across all env steps and all parallel
+  envs during the rollout.
+* ``iter_ep_lengths`` - last value of
+  ``agent.tracking_data["Episode / Total timesteps (mean)"]`` observed at
+  iteration end, or ``0.0`` when no episode terminated yet.
+
+These attributes are filled by :meth:`train` (left empty when it defers to the stock
+loop) and are read directly by ``benchmark_skrl.py``'s v1 bundle builder — no TB round trip.
+"""
+
+from __future__ import annotations
+
+import inspect
+import time
+
+import torch
+import tqdm
+from skrl.trainers.torch import SequentialTrainer
+
+# skrl >= ~2.x removed the ``agents_scope`` keyword from
+# ``SequentialTrainer.__init__``. Detect once at import time so the wrapper
+# stays compatible with both old and new versions without try/except per
+# call site.
+_SUPER_INIT_PARAMS = inspect.signature(SequentialTrainer.__init__).parameters
+_SUPPORTS_AGENTS_SCOPE = "agents_scope" in _SUPER_INIT_PARAMS
+
+
+class BenchmarkTrainer(SequentialTrainer):
+    """SequentialTrainer that records per-iteration timing + reward + ep length."""
+
+    def __init__(self, env, agents, agents_scope=None, cfg=None) -> None:
+        if _SUPPORTS_AGENTS_SCOPE:
+            super().__init__(env=env, agents=agents, agents_scope=agents_scope, cfg=cfg)
+        else:
+            super().__init__(env=env, agents=agents, cfg=cfg)
+        self.iter_times_s: list[float] = []
+        self.iter_rewards: list[float] = []
+        self.iter_ep_lengths: list[float] = []
+
+    def train(self) -> None:
+        # Exactly one non-simultaneous single-agent training path — mirrors
+        # the parent SequentialTrainer for that case. If the user is running
+        # multi-agent or simultaneous agents, defer to the stock loop (those
+        # paths don't populate the per-iteration benchmark attributes).
+        if self.num_simultaneous_agents > 1 or self.env.num_agents > 1:
+            super().train()
+            return
+
+        rollouts_attr = getattr(self.agents, "_rollouts", None)
+        if not rollouts_attr:
+            # Agent has no rollout boundary (e.g. off-policy SAC/DDPG).
+            # Defer to the stock training loop — the per-iter attributes
+            # stay empty, and benchmark_skrl.py will treat that as "no
+            # per-iter data available" rather than wall-time garbage.
+            super().train()
+            return
+        rollouts = int(rollouts_attr)
+        max_iters = self.timesteps // rollouts
+
+        self.agents.set_running_mode("train")
+        states, infos = self.env.reset()
+
+        iter_start_ns = time.perf_counter_ns()
+        rollout_reward_sum = 0.0
+        rollout_reward_count = 0
+
+        for timestep in tqdm.tqdm(
+            range(self.initial_timestep, self.timesteps),
+            disable=self.disable_progressbar,
+        ):
+            self.agents.pre_interaction(timestep=timestep, timesteps=self.timesteps)
+
+            with torch.no_grad():
+                actions = self.agents.act(states, timestep=timestep, timesteps=self.timesteps)[0]
+                next_states, rewards, terminated, truncated, infos = self.env.step(actions)
+
+                if not self.headless:
+                    self.env.render()
+
+                self.agents.record_transition(
+                    states=states,
+                    actions=actions,
+                    rewards=rewards,
+                    next_states=next_states,
+                    terminated=terminated,
+                    truncated=truncated,
+                    infos=infos,
+                    timestep=timestep,
+                    timesteps=self.timesteps,
+                )
+
+                if self.environment_info in infos:
+                    for k, v in infos[self.environment_info].items():
+                        if isinstance(v, torch.Tensor) and v.numel() == 1:
+                            self.agents.track_data(f"Info / {k}", v.item())
+
+                rollout_reward_sum += float(rewards.mean().item())
+                rollout_reward_count += 1
+
+            self.agents.post_interaction(timestep=timestep, timesteps=self.timesteps)
+
+            # Reset envs only when running a single env; multi-env VecEnvs
+            # handle per-env resets themselves. Mirrors
+            # skrl.trainers.torch.base.Trainer.single_agent_train.
+            if self.env.num_envs > 1:
+                states = next_states
+            else:
+                if terminated.any() or truncated.any():
+                    with torch.no_grad():
+                        states, infos = self.env.reset()
+                else:
+                    states = next_states
+
+            # One iteration = one rollout-buffer fill.
+            if (timestep + 1) % rollouts == 0:
+                iter_end_ns = time.perf_counter_ns()
+                self.iter_times_s.append((iter_end_ns - iter_start_ns) / 1e9)
+                mean_reward = rollout_reward_sum / max(rollout_reward_count, 1)
+                self.iter_rewards.append(mean_reward)
+                ep_len_samples = self.agents.tracking_data.get("Episode / Total timesteps (mean)", [])
+                self.iter_ep_lengths.append(float(ep_len_samples[-1]) if ep_len_samples else 0.0)
+                # Reset per-iter accumulators + timer for the next rollout.
+                iter_start_ns = time.perf_counter_ns()
+                rollout_reward_sum = 0.0
+                rollout_reward_count = 0
+
+        # Defensive cap at max_iters. A trailing partial rollout (timesteps not a
+        # clean multiple of rollouts) is never appended above, so this is normally a no-op.
+        self.iter_times_s = self.iter_times_s[:max_iters]
+        self.iter_rewards = self.iter_rewards[:max_iters]
+        self.iter_ep_lengths = self.iter_ep_lengths[:max_iters]
diff --git a/scripts/benchmarks/startup_whitelist.yaml b/scripts/benchmarks/startup_whitelist.yaml
index 121718d36b40..00163c27d328 100644
--- a/scripts/benchmarks/startup_whitelist.yaml
+++ b/scripts/benchmarks/startup_whitelist.yaml
@@ -3,23 +3,44 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
+# Per-phase function whitelist for benchmark_startup.py. Patterns are
+# fnmatch-style; patterns matching no function emit a placeholder row
+# (tottime=0, cumtime=0, ncalls=0) so downstream dashboards always receive
+# consistent keys.
+#
+# A phase MAY be absent from this file — in that case benchmark_startup
+# falls back to top_n selection (default: 30). Phases documented below
+# that intentionally fall through say so explicitly in a comment rather
+# than listing patterns.
+#
+# python_imports: intentional top_n fallback — all top-30 functions have
+#   own_time < 7.1 ms; dominant entries are import-machinery internals
+#   (<frozen importlib._bootstrap>:_find_and_load) and generic builtins,
+#   not stable IsaacLab symbols worth tracking on a dashboard.
+#
+# first_step: intentional top_n fallback — total wall-time is only 0.19 s;
+#   the single function above 10 ms is task-specific
+#   (locomotion_env:_get_rewards at 15 ms) and will differ across tasks,
+#   making a fixed whitelist fragile.
+
 app_launch:
   - "isaaclab.utils.configclass:_wrap_resolvable_strings"
   - "isaaclab.utils.configclass:_custom_post_init"
   - "isaaclab.utils.configclass:_field_module_dir"
+  - "lib.python3.12.copy:deepcopy"
+
+task_config:
+  - "isaaclab.utils.configclass:configclass"
+  - "isaaclab.utils.configclass:_custom_post_init"
+  - "isaaclab.utils.configclass:_wrap_resolvable_strings"
+  - "isaaclab.utils.configclass:_process_mutable_types"
+  - "lib.python3.12.copy:deepcopy"
 
 env_creation:
-  - "isaaclab.cloner.*:usd_replicate"
-  - "isaaclab.cloner.*:filter_collisions"
-  - "isaaclab_physx.cloner.*:attach_end_fn"
-  - "isaaclab.scene.*:_init_scene"
-  - "isaaclab.envs.mdp.observations:*"
+  - "isaaclab.sim.utils.prims:wrapper"
+  - "isaaclab.cloner.cloner_utils:usd_replicate"
+  - "isaaclab_physx.cloner.physx_replicate:attach_end_fn"
+  - "isaaclab_physx.cloner.physx_replicate:physx_replicate"
+  - "isaaclab.cloner.cloner_utils:grid_transforms"
+  - "isaaclab.sim.utils.queries:find_matching_prims"
   - "isaaclab.utils.assets:_find_usd_dependencies"
-
-first_step:
-  - "isaaclab.envs.mdp.rewards:*"
-  - "isaaclab.envs.mdp.terminations:*"
-  - "isaaclab.envs.mdp.observations:*"
-  - "isaaclab.actuators.*:compute"
-  - "warp.*:launch"
-  - "warp.*:to_torch"
diff --git a/scripts/benchmarks/tests/__init__.py b/scripts/benchmarks/tests/__init__.py
new file mode 100644
index 000000000000..460a30569089
--- /dev/null
+++ b/scripts/benchmarks/tests/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
diff --git a/scripts/benchmarks/tests/test_action_sampling.py b/scripts/benchmarks/tests/test_action_sampling.py
new file mode 100644
index 000000000000..3910ae4c2f89
--- /dev/null
+++ b/scripts/benchmarks/tests/test_action_sampling.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Unit tests for :func:`scripts.benchmarks._action_sampling.sample_random_actions`."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+import numpy as np
+import torch
+
+from scripts.benchmarks._action_sampling import sample_random_actions
+
+
+@dataclass
+class _BoxSpace:
+    """Minimal stand-in for ``gym.spaces.Box``. Only needs ``.sample()`` for
+    these tests; we duck-type to avoid pulling gymnasium into the unit-test
+    path (gymnasium is installed inside Isaac Sim's python, not the system
+    one used by ``python3 -m pytest``)."""
+
+    low: float
+    high: float
+    shape: tuple
+
+    def sample(self) -> np.ndarray:
+        return np.random.uniform(low=self.low, high=self.high, size=self.shape).astype(np.float32)
+
+
+def _box(low: float = -1.0, high: float = 1.0, shape: tuple = (3,)) -> _BoxSpace:
+    return _BoxSpace(low=low, high=high, shape=shape)
+
+
+@dataclass
+class _FakeSingleAgentEnv:
+    """Mimic the unwrapped surface of a DirectRLEnv / ManagerBasedRLEnv."""
+
+    num_envs: int = 4
+    device: str = "cpu"
+    single_action_space: _BoxSpace = field(default_factory=_box)
+
+    @property
+    def unwrapped(self):
+        return self
+
+
+@dataclass
+class _FakeMARLEnv:
+    """Mimic the unwrapped surface of a DirectMARLEnv."""
+
+    num_envs: int = 4
+    device: str = "cpu"
+    action_spaces: dict = field(
+        default_factory=lambda: {
+            "cart": _box(shape=(1,)),
+            "pendulum": _box(shape=(1,)),
+        }
+    )
+
+    @property
+    def unwrapped(self):
+        return self
+
+
+def test_sample_random_actions_single_agent_returns_stacked_tensor():
+    """Single-agent envs must get one tensor of shape ``(num_envs, action_dim)`` —
+    ``env.step`` of a DirectRLEnv expects a single tensor, not a dict."""
+    env = _FakeSingleAgentEnv(num_envs=8)
+    actions = sample_random_actions(env)
+    assert isinstance(actions, torch.Tensor)
+    assert actions.shape == (8, 3)
+    assert actions.dtype == torch.float32
+
+
+def test_sample_random_actions_multi_agent_returns_dict():
+    """Multi-agent envs must get a dict ``{agent_id: tensor}`` — that's the
+    shape ``DirectMARLEnv.step`` accepts. The previous code path called
+    ``unwrapped.single_action_space.sample()`` and crashed with
+    ``AttributeError: 'CartDoublePendulumEnv' object has no attribute
+    'single_action_space'`` on every multi-agent benchmark run."""
+    env = _FakeMARLEnv(num_envs=4)
+    actions = sample_random_actions(env)
+    assert isinstance(actions, dict)
+    assert set(actions) == {"cart", "pendulum"}
+    for tensor in actions.values():
+        assert isinstance(tensor, torch.Tensor)
+        assert tensor.shape == (4, 1)
+        assert tensor.dtype == torch.float32
+
+
+def test_sample_random_actions_multi_agent_handles_heterogeneous_action_dims():
+    """Per-agent action spaces can have different shapes — the helper must
+    sample each space at its own dimensionality, not assume a uniform
+    shape across agents."""
+    env = _FakeMARLEnv(
+        num_envs=2,
+        action_spaces={
+            "small": _box(shape=(1,)),
+            "large": _box(shape=(7,)),
+        },
+    )
+    actions = sample_random_actions(env)
+    assert actions["small"].shape == (2, 1)
+    assert actions["large"].shape == (2, 7)
+
+
+def test_sample_random_actions_multi_agent_samples_within_space_bounds():
+    """Sanity-check that the sampled values come from the declared Box —
+    catches a regression where someone replaces ``space.sample()`` with
+    e.g. zeros."""
+    env = _FakeMARLEnv(
+        num_envs=16,
+        action_spaces={
+            "agent": _box(low=-2.0, high=2.0, shape=(1,)),
+        },
+    )
+    actions = sample_random_actions(env)
+    a = actions["agent"]
+    assert (a >= -2.0).all() and (a <= 2.0).all()
+
+
+def test_sample_random_actions_uses_env_device_for_returned_tensors():
+    """Per the original code, the returned tensors live on
+    ``env.device``; otherwise ``env.step(actions)`` will copy from CPU
+    to GPU on every benchmark run and skew the timing."""
+    env = _FakeSingleAgentEnv(device="cpu")  # MPS/CUDA not assumed in tests
+    actions = sample_random_actions(env)
+    assert str(actions.device) == "cpu"
+
+
+def test_sample_random_actions_passes_through_gym_wrappers():
+    """The benchmark runs against a ``gym.make()``-wrapped env; the
+    action-space discriminator must read off ``env.unwrapped`` rather
+    than the wrapper, so that a single-agent gym.Wrapper exposing a
+    legacy ``action_spaces`` attribute (Wrapper has none, but we stay
+    defensive) cannot trick us into the MARL branch."""
+
+    @dataclass
+    class _Wrapper:
+        inner: object
+
+        @property
+        def unwrapped(self):
+            return self.inner
+
+    env = _Wrapper(inner=_FakeSingleAgentEnv())
+    actions = sample_random_actions(env)
+    assert isinstance(actions, torch.Tensor)
+
+
+def test_sample_random_actions_marl_per_env_independence():
+    """Each row in the per-agent action tensor must be an independent
+    sample — i.e., sampling N times produces N (likely) distinct rows.
+    A regression where the loop replaced ``range(num_envs)`` with a
+    single sample broadcasted across rows would slip past the shape
+    check but produce trivially correlated actions across envs."""
+    env = _FakeMARLEnv(
+        num_envs=64,
+        action_spaces={"a": _box(shape=(2,))},
+    )
+    actions = sample_random_actions(env)
+    a = actions["a"].numpy()
+    # With 64 i.i.d. samples from a continuous Box, the number of distinct
+    # rows is overwhelmingly likely to be 64. Allow some slack just in case
+    # of pathological RNG state.
+    assert len({tuple(r) for r in a}) >= 60
diff --git a/scripts/benchmarks/tests/test_benchmark_rsl_rl_cli.py b/scripts/benchmarks/tests/test_benchmark_rsl_rl_cli.py
new file mode 100644
index 000000000000..66532631d14a
--- /dev/null
+++ b/scripts/benchmarks/tests/test_benchmark_rsl_rl_cli.py
@@ -0,0 +1,201 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""CLI-level tests for benchmark_rsl_rl.py.
+
+These tests exercise only the argparse and preset-injection layer — they do
+not import the whole script (which launches Isaac Sim at import time).
+Instead, ``_build_parser`` and the ``_inject_preset*`` helpers mirror its logic.
+"""
+
+from __future__ import annotations
+
+import argparse
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    """Mirror of the parser setup in benchmark_rsl_rl.py.
+
+    Kept in lockstep with the script; when a new flag is added there,
+    add it here too.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--task", type=str)
+    parser.add_argument("--num_envs", type=int)
+    parser.add_argument("--seed", type=int)
+    parser.add_argument("--max_iterations", type=int)
+    parser.add_argument("--backend", choices=["physx", "newton"], default=None)
+    parser.add_argument("--run_id", type=str, default=None)
+    parser.add_argument("--schema_v1_output", type=str, default=None)
+    parser.add_argument("--log_dir", type=str, default=None)
+    return parser
+
+
+def test_log_dir_flag_defaults_none():
+    args = _build_parser().parse_args([])
+    assert args.log_dir is None
+
+
+def test_log_dir_flag_captured():
+    args = _build_parser().parse_args(["--log_dir", "/tmp/bundle/training_data"])
+    assert args.log_dir == "/tmp/bundle/training_data"
+
+
+def _inject_preset(args_cli, hydra_args: list[str]) -> list[str]:
+    """Mirror of the inject_preset logic in benchmark_rsl_rl.py.
+
+    Invariant: when --backend X is set AND hydra_args does NOT already
+    contain a ``presets=...`` entry, prepend ``presets=X``.
+    """
+    if args_cli.backend is None:
+        return hydra_args
+    existing = [a for a in hydra_args if a.startswith("presets=")]
+    if existing:
+        print(f"[WARNING] --backend={args_cli.backend} ignored; explicit {existing[0]} wins.")
+        return hydra_args
+    return [f"presets={args_cli.backend}"] + hydra_args
+
+
+def test_backend_injects_preset_when_none_given():
+    args = _build_parser().parse_args(["--backend", "newton"])
+    out = _inject_preset(args, ["env.decimation=4"])
+    assert out == ["presets=newton", "env.decimation=4"]
+
+
+def test_backend_does_not_inject_when_preset_already_present(capsys):
+    args = _build_parser().parse_args(["--backend", "newton"])
+    out = _inject_preset(args, ["presets=custom", "env.decimation=4"])
+    assert out == ["presets=custom", "env.decimation=4"]
+    assert "ignored" in capsys.readouterr().out
+
+
+def test_backend_unset_is_noop():
+    args = _build_parser().parse_args([])
+    out = _inject_preset(args, ["env.decimation=4"])
+    assert out == ["env.decimation=4"]
+
+
+def _inject_preset_with_validation(args_cli, hydra_args: list[str], has_physics_preset_fn) -> list[str]:
+    """Mirror of the new gated injection in benchmark_rsl_rl.py.
+
+    has_physics_preset_fn is the only injection point — the test passes
+    a stub returning True / False; the production caller passes the real
+    has_physics_preset(raw_cfg, name) closure.
+    """
+    import sys
+
+    if args_cli.backend is None:
+        return hydra_args
+    existing = [a for a in hydra_args if a.startswith("presets=")]
+    if existing:
+        print(f"[WARNING] --backend={args_cli.backend} ignored; explicit {existing[0]} wins.")
+        return hydra_args
+    if not has_physics_preset_fn(args_cli.backend):
+        sys.stderr.write(
+            f"[ERROR] preset_unsupported: task {args_cli.task!r} has no "
+            f"{args_cli.backend!r} preset. Inspect raw_cfg.sim.physics or "
+            f"re-enumerate {{physx,newton}}_envs.yaml.\n"
+        )
+        sys.exit(2)
+    return [f"presets={args_cli.backend}"] + hydra_args
+
+
+def test_validation_blocks_unsupported_preset(capsys):
+    args = _build_parser().parse_args(["--task", "Isaac-Foo-v0", "--backend", "physx"])
+    import pytest
+
+    with pytest.raises(SystemExit) as exc_info:
+        _inject_preset_with_validation(args, ["env.x=1"], has_physics_preset_fn=lambda name: False)
+    assert exc_info.value.code == 2
+    captured = capsys.readouterr()
+    assert "preset_unsupported:" in captured.err
+    assert "Isaac-Foo-v0" in captured.err
+
+
+def test_validation_passes_when_supported():
+    args = _build_parser().parse_args(["--task", "Isaac-Bar-v0", "--backend", "newton"])
+    out = _inject_preset_with_validation(args, ["env.x=1"], has_physics_preset_fn=lambda name: True)
+    assert out == ["presets=newton", "env.x=1"]
+
+
+def test_validation_skipped_when_explicit_preset_present(capsys):
+    """Explicit presets= in hydra_args bypasses validation (operator override)."""
+    args = _build_parser().parse_args(["--task", "Isaac-Foo-v0", "--backend", "physx"])
+
+    def _bomb(name: str) -> bool:
+        raise AssertionError("validator must not run when explicit preset is present")
+
+    out = _inject_preset_with_validation(args, ["presets=custom", "env.x=1"], has_physics_preset_fn=_bomb)
+    assert out == ["presets=custom", "env.x=1"]
+    assert "ignored" in capsys.readouterr().out
+
+
+def _inject_preset_with_validation_v2(
+    args_cli,
+    hydra_args: list[str],
+    has_physics_preset_fn,
+    native_backend_matches_fn,
+) -> list[str]:
+    """Mirror of the new gated injection in benchmark_rsl_rl.py (post native-backend fix).
+
+    Two stub injection points:
+      - has_physics_preset_fn(name) -> bool (existing)
+      - native_backend_matches_fn(name) -> bool (new)
+    """
+    import sys
+
+    if args_cli.backend is None:
+        return hydra_args
+    existing = [a for a in hydra_args if a.startswith("presets=")]
+    if existing:
+        print(f"[WARNING] --backend={args_cli.backend} ignored; explicit {existing[0]} wins.")
+        return hydra_args
+    if has_physics_preset_fn(args_cli.backend):
+        return [f"presets={args_cli.backend}"] + hydra_args
+    if native_backend_matches_fn(args_cli.backend):
+        print(
+            f"[INFO] task {args_cli.task!r} has no '{args_cli.backend}' preset; "
+            f"running on native {args_cli.backend} backend (no injection).",
+            file=sys.stderr,
+        )
+        return hydra_args
+    sys.stderr.write(
+        f"[ERROR] preset_unsupported: task {args_cli.task!r} has no "
+        f"{args_cli.backend!r} preset. Inspect raw_cfg.sim.physics or "
+        f"re-enumerate {{physx,newton}}_envs.yaml.\n"
+    )
+    raise SystemExit(2)
+
+
+def test_validation_skips_injection_when_native_matches(capsys):
+    """No preset, but cfg type matches request → run with no injection + [INFO] log."""
+    args = _build_parser().parse_args(["--task", "Isaac-Quadcopter-Direct-v0", "--backend", "physx"])
+    out = _inject_preset_with_validation_v2(
+        args,
+        ["env.x=1"],
+        has_physics_preset_fn=lambda name: False,
+        native_backend_matches_fn=lambda name: True,
+    )
+    assert out == ["env.x=1"]
+    captured = capsys.readouterr()
+    assert "running on native physx" in captured.err
+    assert "no injection" in captured.err
+
+
+def test_validation_still_blocks_when_native_mismatches(capsys):
+    """No preset AND cfg type doesn't match → existing exit-2 + preset_unsupported: stderr (regression)."""
+    args = _build_parser().parse_args(["--task", "Isaac-NewtonOnly-v0", "--backend", "physx"])
+    import pytest
+
+    with pytest.raises(SystemExit) as exc_info:
+        _inject_preset_with_validation_v2(
+            args,
+            ["env.x=1"],
+            has_physics_preset_fn=lambda name: False,
+            native_backend_matches_fn=lambda name: False,
+        )
+    assert exc_info.value.code == 2
+    captured = capsys.readouterr()
+    assert "preset_unsupported:" in captured.err
diff --git a/scripts/benchmarks/tests/test_benchmark_skrl_cli.py b/scripts/benchmarks/tests/test_benchmark_skrl_cli.py
new file mode 100644
index 000000000000..7ca5fdb1a2e8
--- /dev/null
+++ b/scripts/benchmarks/tests/test_benchmark_skrl_cli.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""CLI-level tests for benchmark_skrl.py — argparse-only, no Isaac Sim."""
+
+from __future__ import annotations
+
+import argparse
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--task", type=str)
+    parser.add_argument("--num_envs", type=int)
+    parser.add_argument("--seed", type=int)
+    parser.add_argument("--max_iterations", type=int)
+    parser.add_argument("--backend", choices=["physx", "newton"], default=None)
+    parser.add_argument("--run_id", type=str, default=None)
+    parser.add_argument("--schema_v1_output", type=str, default=None)
+    parser.add_argument("--log_dir", type=str, default=None)
+    parser.add_argument("--ml_framework", type=str, default="torch")
+    return parser
+
+
+def _inject_preset(args_cli, hydra_args: list[str]) -> list[str]:
+    if args_cli.backend is None:
+        return hydra_args
+    existing = [a for a in hydra_args if a.startswith("presets=")]
+    if existing:
+        print(f"[WARNING] --backend={args_cli.backend} ignored; explicit {existing[0]} wins.")
+        return hydra_args
+    return [f"presets={args_cli.backend}"] + hydra_args
+
+
+def test_log_dir_flag_defaults_none():
+    assert _build_parser().parse_args([]).log_dir is None
+
+
+def test_log_dir_flag_captured():
+    args = _build_parser().parse_args(["--log_dir", "/tmp/bundle/training_data"])
+    assert args.log_dir == "/tmp/bundle/training_data"
+
+
+def test_backend_injects_preset_when_none_given():
+    args = _build_parser().parse_args(["--backend", "newton"])
+    assert _inject_preset(args, ["env.decimation=4"]) == ["presets=newton", "env.decimation=4"]
+
+
+def test_backend_does_not_inject_when_preset_already_present(capsys):
+    args = _build_parser().parse_args(["--backend", "newton"])
+    out = _inject_preset(args, ["presets=custom", "env.decimation=4"])
+    assert out == ["presets=custom", "env.decimation=4"]
+    assert "ignored" in capsys.readouterr().out
+
+
+def test_backend_unset_is_noop():
+    args = _build_parser().parse_args([])
+    assert _inject_preset(args, ["env.decimation=4"]) == ["env.decimation=4"]
+
+
+def _inject_preset_with_validation(args_cli, hydra_args: list[str], has_physics_preset_fn) -> list[str]:
+    """Mirror of the new gated injection in benchmark_skrl.py.
+
+    has_physics_preset_fn is the only injection point — the test passes
+    a stub returning True / False; the production caller passes the real
+    has_physics_preset(raw_cfg, name) closure.
+    """
+    import sys
+
+    if args_cli.backend is None:
+        return hydra_args
+    existing = [a for a in hydra_args if a.startswith("presets=")]
+    if existing:
+        print(f"[WARNING] --backend={args_cli.backend} ignored; explicit {existing[0]} wins.")
+        return hydra_args
+    if not has_physics_preset_fn(args_cli.backend):
+        sys.stderr.write(
+            f"[ERROR] preset_unsupported: task {args_cli.task!r} has no "
+            f"{args_cli.backend!r} preset. Inspect raw_cfg.sim.physics or "
+            f"re-enumerate {{physx,newton}}_envs.yaml.\n"
+        )
+        sys.exit(2)
+    return [f"presets={args_cli.backend}"] + hydra_args
+
+
+def test_validation_blocks_unsupported_preset(capsys):
+    args = _build_parser().parse_args(["--task", "Isaac-Foo-v0", "--backend", "physx"])
+    import pytest
+
+    with pytest.raises(SystemExit) as exc_info:
+        _inject_preset_with_validation(args, ["env.x=1"], has_physics_preset_fn=lambda name: False)
+    assert exc_info.value.code == 2
+    captured = capsys.readouterr()
+    assert "preset_unsupported:" in captured.err
+    assert "Isaac-Foo-v0" in captured.err
+
+
+def test_validation_passes_when_supported():
+    args = _build_parser().parse_args(["--task", "Isaac-Bar-v0", "--backend", "newton"])
+    out = _inject_preset_with_validation(args, ["env.x=1"], has_physics_preset_fn=lambda name: True)
+    assert out == ["presets=newton", "env.x=1"]
+
+
+def test_validation_skipped_when_explicit_preset_present(capsys):
+    """Explicit presets= in hydra_args bypasses validation (operator override)."""
+    args = _build_parser().parse_args(["--task", "Isaac-Foo-v0", "--backend", "physx"])
+
+    def _bomb(name: str) -> bool:
+        raise AssertionError("validator must not run when explicit preset is present")
+
+    out = _inject_preset_with_validation(args, ["presets=custom", "env.x=1"], has_physics_preset_fn=_bomb)
+    assert out == ["presets=custom", "env.x=1"]
+    assert "ignored" in capsys.readouterr().out
+
+
+def _inject_preset_with_validation_v2(
+    args_cli,
+    hydra_args: list[str],
+    has_physics_preset_fn,
+    native_backend_matches_fn,
+) -> list[str]:
+    """Mirror of the new gated injection in benchmark_skrl.py (post native-backend fix).
+
+    Returns hydra_args as-is or with ``presets=<backend>`` prepended; else ``SystemExit(2)``.
+      - has_physics_preset_fn(name) -> bool (existing stub)
+      - native_backend_matches_fn(name) -> bool (new stub)
+    """
+    import sys
+
+    if args_cli.backend is None:  # no --backend requested: nothing to inject
+        return hydra_args
+    existing = [a for a in hydra_args if a.startswith("presets=")]
+    if existing:  # an explicit presets= override takes precedence over --backend
+        print(f"[WARNING] --backend={args_cli.backend} ignored; explicit {existing[0]} wins.")
+        return hydra_args
+    if has_physics_preset_fn(args_cli.backend):  # preset exists: prepend it to the overrides
+        return [f"presets={args_cli.backend}"] + hydra_args
+    if native_backend_matches_fn(args_cli.backend):  # no preset, but the native-backend stub accepts it
+        print(
+            f"[INFO] task {args_cli.task!r} has no '{args_cli.backend}' preset; "
+            f"running on native {args_cli.backend} backend (no injection).",
+            file=sys.stderr,
+        )
+        return hydra_args
+    sys.stderr.write(
+        f"[ERROR] preset_unsupported: task {args_cli.task!r} has no "
+        f"{args_cli.backend!r} preset. Inspect raw_cfg.sim.physics or "
+        f"re-enumerate {{physx,newton}}_envs.yaml.\n"
+    )
+    raise SystemExit(2)
+
+
+def test_validation_skips_injection_when_native_matches(capsys) -> None:
+    """No preset, but cfg type matches request → run with no injection + [INFO] log."""
+    args = _build_parser().parse_args(["--task", "Isaac-Quadcopter-Direct-v0", "--backend", "physx"])
+    out = _inject_preset_with_validation_v2(
+        args,
+        ["env.x=1"],
+        has_physics_preset_fn=lambda name: False,
+        native_backend_matches_fn=lambda name: True,
+    )
+    assert out == ["env.x=1"]  # no presets= entry was prepended
+    captured = capsys.readouterr()
+    assert "running on native physx" in captured.err  # the [INFO] notice is written to stderr
+    assert "no injection" in captured.err
+
+
+def test_validation_still_blocks_when_native_mismatches(capsys) -> None:
+    """No preset AND cfg type doesn't match → existing exit-2 + preset_unsupported: stderr (regression)."""
+    args = _build_parser().parse_args(["--task", "Isaac-NewtonOnly-v0", "--backend", "physx"])
+    import pytest
+
+    with pytest.raises(SystemExit) as exc_info:
+        _inject_preset_with_validation_v2(
+            args,
+            ["env.x=1"],
+            has_physics_preset_fn=lambda name: False,
+            native_backend_matches_fn=lambda name: False,
+        )
+    assert exc_info.value.code == 2  # both stubs declined, so the helper raised SystemExit(2)
+    captured = capsys.readouterr()
+    assert "preset_unsupported:" in captured.err
+
+
+def _compose_experiment_dir(directory: str, experiment_name: str, agent_classname: str = "PPO") -> str:
+    """Mirror of SKRL BaseAgent.__init__'s experiment-dir composition.
+
+    Replicates the falsy-string fallback so tests can assert the final
+    ``experiment_dir`` a real SKRL agent would pick.
+    """
+    import datetime
+    import os
+
+    if not directory:  # empty directory -> "<cwd>/runs"
+        directory = os.path.join(os.getcwd(), "runs")
+    if not experiment_name:  # empty name -> "<timestamp>_<agent_classname>"
+        experiment_name = "{}_{}".format(datetime.datetime.now().strftime("%y-%m-%d_%H-%M-%S-%f"), agent_classname)
+    return os.path.join(directory, experiment_name)
+
+
+def _apply_log_dir_override(log_dir_arg: str) -> dict[str, str]:
+    """Mirror of the agent_cfg mutation in benchmark_skrl.py's log_dir branch."""
+    import os
+
+    log_dir = os.path.abspath(log_dir_arg)  # absolute, normalized form of the argument
+    return {
+        "directory": os.path.dirname(log_dir) or ".",  # parent directory of the requested log dir
+        "experiment_name": os.path.basename(log_dir),  # final path component
+    }
+
+
+def test_log_dir_override_recomposes_to_exact_path() -> None:
+    """The override must make experiment_dir equal the absolute log_dir."""
+    import os
+
+    log_dir = "/tmp/bundle_xyz/training_data"
+    override = _apply_log_dir_override(log_dir)  # split into directory + experiment_name
+    composed = _compose_experiment_dir(override["directory"], override["experiment_name"])  # re-join them
+    assert composed == os.path.abspath(log_dir), (
+        f"experiment_dir {composed!r} != {log_dir!r}; SKRL will silently "
+        f"interpose a timestamp subdir when experiment_name is empty."
+    )
+
+
+def test_log_dir_override_handles_trailing_slash() -> None:
+    """Trailing slash on --log_dir should not corrupt the basename split."""
+    import os
+
+    log_dir = "/tmp/bundle_abc/training_data/"  # note the trailing slash
+    override = _apply_log_dir_override(log_dir)
+    composed = _compose_experiment_dir(override["directory"], override["experiment_name"])
+    assert composed == os.path.abspath(log_dir)  # abspath() on the expected side drops the slash too
diff --git a/scripts/benchmarks/tests/test_skrl_benchmark_trainer.py b/scripts/benchmarks/tests/test_skrl_benchmark_trainer.py
new file mode 100644
index 000000000000..7fb6672d6327
--- /dev/null
+++ b/scripts/benchmarks/tests/test_skrl_benchmark_trainer.py
@@ -0,0 +1,232 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Unit tests for BenchmarkTrainer — run with a fake env and fake agent.
+
+These tests do NOT spin up Isaac Sim. They verify the trainer's
+per-iteration capture logic in isolation.
+"""
+
+from __future__ import annotations
+
+import time
+
+import pytest
+import torch
+
+from scripts.benchmarks.skrl_benchmark_trainer import BenchmarkTrainer
+
+
+class _FakeEnv:
+    """Minimal env compatible with SKRL's SequentialTrainer expectations."""
+
+    num_agents = 1
+    num_envs = 4
+    state_space = None
+    observation_space = type("O", (), {"shape": (2,)})()  # throwaway object exposing only .shape
+    action_space = type("A", (), {"shape": (1,)})()  # throwaway object exposing only .shape
+    device = torch.device("cpu")
+
+    def __init__(self, reward_schedule):
+        self._rewards = reward_schedule  # list[float] — one per step
+        self._i = 0  # number of step() calls made so far
+
+    def reset(self):
+        return torch.zeros(self.num_envs, 2), {}  # (zero observations, empty info dict)
+
+    def step(self, actions):
+        r = self._rewards[self._i % len(self._rewards)]  # schedule wraps around when exhausted
+        self._i += 1
+        rewards = torch.full((self.num_envs,), float(r))  # same scalar reward for every env
+        terminated = torch.zeros(self.num_envs, dtype=torch.bool)  # all False: never terminates
+        truncated = torch.zeros(self.num_envs, dtype=torch.bool)  # all False: never truncates
+        next_states = torch.zeros(self.num_envs, 2)
+        return next_states, rewards, terminated, truncated, {}
+
+    def render(self):
+        pass
+
+    def close(self):
+        pass
+
+
+class _FakeAgent:
+    """Minimal agent that exposes `_rollouts`, pre/post_interaction, track_data."""
+
+    def __init__(self, rollouts: int = 4):
+        self._rollouts = rollouts
+        self.tracking_data: dict[str, list[float]] = {}  # tag -> values appended by track_data()
+        self._init_called = False  # flipped to True by init()
+        self._running_mode = None  # last value passed to set_running_mode()
+
+    def init(self, trainer_cfg):
+        self._init_called = True
+
+    def set_running_mode(self, mode):
+        self._running_mode = mode
+
+    def pre_interaction(self, timestep, timesteps):
+        pass
+
+    def act(self, states, timestep, timesteps):
+        return torch.zeros(states.shape[0], 1), None, None  # one zero action per row of `states`
+
+    def record_transition(self, **kwargs):
+        pass
+
+    def post_interaction(self, timestep, timesteps):
+        pass
+
+    def track_data(self, tag, value):
+        self.tracking_data.setdefault(tag, []).append(value)
+
+
+def test_iter_times_s_length_matches_iterations() -> None:
+    rollouts = 4
+    max_iters = 3
+    env = _FakeEnv(reward_schedule=[1.0] * 100)
+    agent = _FakeAgent(rollouts=rollouts)
+    trainer_cfg = {"timesteps": rollouts * max_iters, "headless": True}  # max_iters rollouts' worth of steps
+
+    trainer = BenchmarkTrainer(env=env, agents=agent, cfg=trainer_cfg)
+    trainer.train()
+
+    assert len(trainer.iter_times_s) == max_iters  # one timing entry per iteration
+    assert all(t > 0.0 for t in trainer.iter_times_s)
+
+
+def test_iter_rewards_reflects_synthetic_schedule() -> None:
+    rollouts = 4
+    max_iters = 3
+    # Give each rollout a distinguishable reward value.
+    schedule = [1.0] * rollouts + [2.0] * rollouts + [3.0] * rollouts
+    env = _FakeEnv(reward_schedule=schedule)
+    agent = _FakeAgent(rollouts=rollouts)
+    trainer_cfg = {"timesteps": rollouts * max_iters, "headless": True}
+
+    trainer = BenchmarkTrainer(env=env, agents=agent, cfg=trainer_cfg)
+    trainer.train()
+
+    # Each iteration's mean reward = mean over rollouts*num_envs rewards.
+    # The schedule is constant within a rollout, so iter k averages to schedule[k*rollouts].
+    assert trainer.iter_rewards == pytest.approx([1.0, 2.0, 3.0])
+
+
+def test_iter_ep_lengths_defaults_to_zero_when_no_termination() -> None:
+    rollouts = 4
+    max_iters = 2
+    env = _FakeEnv(reward_schedule=[0.0] * 100)
+    agent = _FakeAgent(rollouts=rollouts)
+    trainer_cfg = {"timesteps": rollouts * max_iters, "headless": True}
+
+    trainer = BenchmarkTrainer(env=env, agents=agent, cfg=trainer_cfg)
+    trainer.train()
+
+    # Fake env never terminates or truncates → ep_lengths fall back to 0.0 each iter.
+    assert trainer.iter_ep_lengths == [0.0, 0.0]
+
+
+def test_iter_times_s_shows_variance_with_sleep() -> None:
+    """Real per-iter timing must vary when iterations take different wall times."""
+    rollouts = 2
+    max_iters = 2
+
+    class _SlowEnv(_FakeEnv):
+        def step(self, actions):
+            if self._i == 0 or self._i == 1:  # _i is bumped in super().step(): only the first two calls sleep
+                time.sleep(0.02)
+            return super().step(actions)
+
+    env = _SlowEnv(reward_schedule=[0.0] * 100)
+    agent = _FakeAgent(rollouts=rollouts)
+    trainer_cfg = {"timesteps": rollouts * max_iters, "headless": True}
+
+    trainer = BenchmarkTrainer(env=env, agents=agent, cfg=trainer_cfg)
+    trainer.train()
+
+    assert len(trainer.iter_times_s) == max_iters
+    # First iter had two sleep(0.02) calls (steps 0 and 1); second iter didn't.
+    # Accept any positive separation; this is about existence of variance, not magnitude.
+    assert trainer.iter_times_s[0] > trainer.iter_times_s[1]
+
+
+def test_multi_env_does_not_call_env_reset_on_termination() -> None:
+    """Regression: Task 4's initial fix unconditionally reset on any termination,
+    which corrupts multi-env VecEnv training (parent's single_agent_train guards
+    this on num_envs > 1)."""
+    rollouts = 4
+    max_iters = 2
+
+    class _CountingMultiEnv(_FakeEnv):
+        num_envs = 8  # multi-env — parent must NOT mid-train reset
+
+        def __init__(self, reward_schedule):
+            super().__init__(reward_schedule=reward_schedule)
+            self.reset_calls = 0  # incremented on every reset() call
+
+        def reset(self):
+            self.reset_calls += 1
+            return torch.zeros(self.num_envs, 2), {}
+
+        def step(self, actions):
+            r = self._rewards[self._i % len(self._rewards)]
+            self._i += 1
+            rewards = torch.full((self.num_envs,), float(r))
+            # Terminate env 0 on every step — should NOT trigger env.reset()
+            terminated = torch.zeros(self.num_envs, dtype=torch.bool)
+            terminated[0] = True
+            truncated = torch.zeros(self.num_envs, dtype=torch.bool)
+            next_states = torch.zeros(self.num_envs, 2)
+            return next_states, rewards, terminated, truncated, {}
+
+    env = _CountingMultiEnv(reward_schedule=[1.0] * 100)
+    agent = _FakeAgent(rollouts=rollouts)
+    trainer_cfg = {"timesteps": rollouts * max_iters, "headless": True}
+    trainer = BenchmarkTrainer(env=env, agents=agent, cfg=trainer_cfg)
+    trainer.train()
+
+    # Exactly one reset is expected — the initial one at loop start.
+    assert env.reset_calls == 1, (
+        f"BenchmarkTrainer called env.reset() {env.reset_calls} times on a "
+        f"multi-env VecEnv. Parent single_agent_train only resets at start "
+        f"when num_envs > 1 — VecEnv handles per-env auto-reset internally."
+    )
+
+
+def test_single_env_resets_when_episode_ends() -> None:
+    """Sanity: the single-env branch still resets on termination."""
+    rollouts = 2
+    max_iters = 1
+
+    class _CountingSingleEnv(_FakeEnv):
+        num_envs = 1
+
+        def __init__(self, reward_schedule):
+            super().__init__(reward_schedule=reward_schedule)
+            self.reset_calls = 0  # incremented on every reset() call
+
+        def reset(self):
+            self.reset_calls += 1
+            return torch.zeros(self.num_envs, 2), {}
+
+        def step(self, actions):
+            r = self._rewards[self._i % len(self._rewards)]
+            self._i += 1
+            rewards = torch.full((self.num_envs,), float(r))
+            terminated = torch.zeros(self.num_envs, dtype=torch.bool)
+            terminated[0] = True  # terminate every step on num_envs=1
+            truncated = torch.zeros(self.num_envs, dtype=torch.bool)
+            next_states = torch.zeros(self.num_envs, 2)
+            return next_states, rewards, terminated, truncated, {}
+
+    env = _CountingSingleEnv(reward_schedule=[0.0] * 100)
+    agent = _FakeAgent(rollouts=rollouts)
+    trainer_cfg = {"timesteps": rollouts * max_iters, "headless": True}
+    trainer = BenchmarkTrainer(env=env, agents=agent, cfg=trainer_cfg)
+    trainer.train()
+
+    # Nominally: initial reset (1) + one reset per terminating step (rollouts=2)
+    # = 3 total; the assertion below only enforces the weaker lower bound of 2.
+    assert env.reset_calls >= 2, f"Expected ≥2 resets on single-env terminations, got {env.reset_calls}"
diff --git a/scripts/benchmarks/utils.py b/scripts/benchmarks/utils.py
index 05effa524172..fd1b82cf981f 100644
--- a/scripts/benchmarks/utils.py
+++ b/scripts/benchmarks/utils.py
@@ -346,7 +346,7 @@ def parse_cprofile_stats(
     isaaclab_prefixes: list[str],
     top_n: int = 30,
     whitelist: list[str] | None = None,
-) -> list[tuple[str, float, float]]:
+) -> list[tuple[str, float, float, int]]:
     """Parse cProfile stats, filtering to IsaacLab + first-level external calls.
 
     Walks the pstats data and keeps functions that are either (a) inside an
@@ -368,8 +368,10 @@ def parse_cprofile_stats(
             functions (e.g. ``["isaaclab.cloner.*:usd_replicate"]``).
 
     Returns:
-        List of (function_label, tottime_ms, cumtime_ms) tuples sorted by
-        tottime descending.
+        List of ``(function_label, tottime_ms, cumtime_ms, ncalls)`` tuples
+        sorted by tottime descending. ``ncalls`` is the total call count
+        (recursive calls included) from ``pstats.Stats.stats``. Whitelist
+        placeholder rows carry ``ncalls=0``.
     """
     import fnmatch
     import io
@@ -409,18 +411,18 @@ def _make_label(filename: str, funcname: str) -> str:
     # stats.stats: dict[(filename, lineno, funcname)] -> (pcalls, ncalls, tottime, cumtime, callers)
     # callers: dict[(filename, lineno, funcname)] -> (pcalls, ncalls, tottime, cumtime)
     results = []
-    for func_key, (_, _, tottime, cumtime, callers) in stats.stats.items():
+    for func_key, (_, ncalls, tottime, cumtime, callers) in stats.stats.items():
         filename, _, funcname = func_key
         if _is_isaaclab(filename):
             label = _make_label(filename, funcname)
-            results.append((label, tottime * 1000.0, cumtime * 1000.0))
+            results.append((label, tottime * 1000.0, cumtime * 1000.0, ncalls))
         else:
             # Check if any direct caller is an IsaacLab function
             for caller_key in callers:
                 caller_filename = caller_key[0]
                 if _is_isaaclab(caller_filename):
                     label = _make_label(filename, funcname)
-                    results.append((label, tottime * 1000.0, cumtime * 1000.0))
+                    results.append((label, tottime * 1000.0, cumtime * 1000.0, ncalls))
                     break
 
     # Sort by tottime (own-time) descending
@@ -430,23 +432,25 @@ def _make_label(filename: str, funcname: str) -> str:
         return results[:top_n]
 
     # Whitelist mode: filter by fnmatch patterns, emit placeholders for unmatched patterns
-    matched: dict[str, tuple[str, float, float]] = {}
+    matched: dict[str, tuple[str, float, float, int]] = {}
     matched_patterns: set[str] = set()
-    for label, tottime, cumtime in results:
+    for label, tottime, cumtime, ncalls in results:
         for pattern in whitelist:
             if fnmatch.fnmatch(label, pattern):
                 if label not in matched:
-                    matched[label] = (label, tottime, cumtime)
+                    matched[label] = (label, tottime, cumtime, ncalls)
                 matched_patterns.add(pattern)
 
-    # Add 0.0 placeholders for patterns that matched nothing
+    # Add 0.0 placeholders for patterns that matched nothing. Placeholder rows
+    # keep the schema shape (still a 4-tuple) and carry ncalls=0 — semantically
+    # "this pattern matched nothing, so no call count is meaningful."
     for pattern in whitelist:
         if pattern not in matched_patterns:
             print(
                 f"[WARNING] Whitelist pattern '{pattern}' matched no profiled functions. "
                 "Check for typos or verify the function ran during this phase."
             )
-            matched[pattern] = (pattern, 0.0, 0.0)
+            matched[pattern] = (pattern, 0.0, 0.0, 0)
 
     filtered = list(matched.values())
     filtered.sort(key=lambda x: x[1], reverse=True)
diff --git a/source/isaaclab/changelog.d/antoiner-feat-benchmark-scripts-v1.rst b/source/isaaclab/changelog.d/antoiner-feat-benchmark-scripts-v1.rst
new file mode 100644
index 000000000000..c4d851ea95a6
--- /dev/null
+++ b/source/isaaclab/changelog.d/antoiner-feat-benchmark-scripts-v1.rst
@@ -0,0 +1,26 @@
+Added
+^^^^^
+
+* Added an opt-in ``--schema_v1_output <path>`` flag to ``benchmark_startup.py``,
+  ``benchmark_rsl_rl.py``, and ``benchmark_skrl.py``. When set, each script
+  emits a self-contained ``training.json`` / ``startup.json`` JSON file
+  conforming to :mod:`isaaclab.benchmark.schema` (v1.0) — run identity,
+  software versions, host hardware, aggregated runtime + resource metrics,
+  and EMA-smoothed reward / episode-length curves. The legacy per-backend
+  output format remains the default when the flag is omitted.
+* Added ``benchmark_skrl.py``: the SKRL-framework counterpart to
+  ``benchmark_rsl_rl.py``. Emits an identical v1.0 ``TrainingBundle`` with
+  ``framework: "skrl"``.
+* Added :doc:`/source/features/benchmarking` documenting the three scripts
+  and the v1.0 bundle schema.
+
+Changed
+^^^^^^^
+
+* Extended :func:`scripts.benchmarks.utils.parse_cprofile_stats` to return a
+  4-tuple ``(function_label, tottime_ms, cumtime_ms, ncalls)`` instead of a
+  3-tuple, exposing the total call count from ``pstats`` for downstream
+  consumers. Existing tuple-unpacking call sites updated.
+* Reworked ``scripts/benchmarks/startup_whitelist.yaml`` to track the
+  IsaacLab v3 configclass / cloner / scene-init call paths and added an
+  explicit ``task_config`` phase entry.
diff --git a/source/isaaclab/test/benchmark/test_parse_cprofile_stats.py b/source/isaaclab/test/benchmark/test_parse_cprofile_stats.py
new file mode 100644
index 000000000000..48e7c4f956f7
--- /dev/null
+++ b/source/isaaclab/test/benchmark/test_parse_cprofile_stats.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Unit tests for :func:`scripts.benchmarks.utils.parse_cprofile_stats`.
+
+The function is expected to return 4-tuples
+``(label, tottime_ms, cumtime_ms, ncalls)`` after the T2.2 reliability fix.
+Before the fix, the function returned 3-tuples and CProfileFunction.calls was
+always 0 in the downstream startup bundle.
+"""
+
+from __future__ import annotations
+
+import cProfile
+import os
+import sys
+
+# scripts/benchmarks/utils.py is not an installable package; add the repo
+# root to sys.path so the import works.
+_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../.."))
+if _REPO_ROOT not in sys.path:
+    sys.path.insert(0, _REPO_ROOT)
+
+from scripts.benchmarks.utils import parse_cprofile_stats  # noqa: E402
+
+
+def _profiled_call(n_outer: int, n_inner: int) -> cProfile.Profile:
+    """Profile ``n_outer`` calls of ``outer()``, each making ``n_inner`` calls of ``inner()``."""
+
+    def inner():
+        return sum(range(10))
+
+    def outer():
+        for _ in range(n_inner):
+            inner()
+
+    prof = cProfile.Profile()
+    prof.enable()  # only the loop below is captured by the profiler
+    for _ in range(n_outer):
+        outer()
+    prof.disable()
+    return prof
+
+
+def test_top_n_returns_ncalls() -> None:
+    # The synthetic functions live in THIS test file, which is outside any
+    # real IsaacLab prefix, so they would normally be filtered out. Passing
+    # this file's directory as an isaaclab prefix makes the parser keep them
+    # (presumably via a path-prefix match in _is_isaaclab — not shown here).
+    test_dir = os.path.abspath(os.path.dirname(__file__))
+    prof = _profiled_call(n_outer=3, n_inner=5)
+
+    results = parse_cprofile_stats(prof, isaaclab_prefixes=[test_dir], top_n=30)
+
+    # Each row must be a 4-tuple now.
+    assert results, "parse_cprofile_stats should return at least one row"
+    for row in results:
+        assert len(row) == 4, f"expected (label, tot, cum, ncalls) 4-tuple, got {row!r}"
+        label, tot, cum, ncalls = row
+        assert isinstance(label, str)
+        assert isinstance(tot, float)
+        assert isinstance(cum, float)
+        assert isinstance(ncalls, int)
+        assert ncalls >= 0
+
+    # Locate our two functions by label suffix and check their call counts.
+    outer_rows = [r for r in results if r[0].endswith(":outer")]
+    inner_rows = [r for r in results if r[0].endswith(":inner")]
+    assert outer_rows, f"outer() should be in results, got labels: {[r[0] for r in results]}"
+    assert inner_rows, f"inner() should be in results, got labels: {[r[0] for r in results]}"
+    assert outer_rows[0][3] == 3, f"outer ncalls should be 3, got {outer_rows[0][3]}"
+    assert inner_rows[0][3] == 15, f"inner ncalls should be 3*5=15, got {inner_rows[0][3]}"
+
+
+def test_whitelist_path_returns_ncalls() -> None:
+    test_dir = os.path.abspath(os.path.dirname(__file__))
+    prof = _profiled_call(n_outer=2, n_inner=4)
+
+    results = parse_cprofile_stats(
+        prof,
+        isaaclab_prefixes=[test_dir],
+        whitelist=["*:inner", "*:definitely_not_a_real_function"],
+    )
+
+    # Matched row carries the real ncalls; the unmatched pattern yields a zeroed placeholder row.
+    labels = {r[0]: r for r in results}
+    inner_label = next((lbl for lbl in labels if lbl.endswith(":inner")), None)
+    assert inner_label is not None, f"inner() should match wildcard whitelist, labels: {list(labels)}"
+    assert labels[inner_label][3] == 8, f"inner ncalls should be 2*4=8, got {labels[inner_label][3]}"
+
+    placeholder = labels.get("*:definitely_not_a_real_function")  # placeholder is keyed by the pattern itself
+    assert placeholder is not None, "placeholder row should be emitted for unmatched pattern"
+    assert placeholder == ("*:definitely_not_a_real_function", 0.0, 0.0, 0)