From d7493ad99c55e65259d627fa4ec967866a901cbc Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Thu, 14 May 2026 13:00:51 +0200 Subject: [PATCH 1/6] fix(gateway,#395): scope per-entity fault GET/DELETE by hosted-app FQNs Per-entity fault routes used the addressed entity's namespace_path as a transport-level prefix filter. When namespace_path was empty (host-derived or synthetic components, manifest components without an explicit `namespace` field, Areas, Functions), the filter short-circuited: - GET /{entity}/faults/{code} returned faults reported by apps that lived in entirely different entities. - DELETE /{entity}/faults/{code} cleared them, since ClearFault.srv has no scope argument and the handler did not check ownership first. Both handlers now resolve the entity to the set of FQNs of every hosted app via the new HandlerContext::resolve_entity_source_fqns helper and require the fault's reporting_sources to intersect that set. Empty set means "nothing in scope" -> 404. DELETE additionally fetches the fault and verifies scope before issuing the clear. The GetFault.srv / ClearFault.srv contracts are unchanged. --- src/ros2_medkit_gateway/CHANGELOG.rst | 1 + .../http/handlers/handler_context.hpp | 25 ++++ .../src/http/handlers/fault_handlers.cpp | 78 ++++++++++- .../src/http/handlers/handler_context.cpp | 53 +++++++ .../test/test_handler_context.cpp | 108 +++++++++++++++ .../test_faults_scope_isolation.test.py | 130 ++++++++++++++++++ 6 files changed, 391 insertions(+), 4 deletions(-) create mode 100644 src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py diff --git a/src/ros2_medkit_gateway/CHANGELOG.rst b/src/ros2_medkit_gateway/CHANGELOG.rst index 63861402..bcfe642f 100644 --- a/src/ros2_medkit_gateway/CHANGELOG.rst +++ b/src/ros2_medkit_gateway/CHANGELOG.rst @@ -7,6 +7,7 @@ Unreleased **Fixes:** +* ``GET /api/v1/{entity-path}/faults/{fault_code}`` and ``DELETE /api/v1/{entity-path}/faults/{fault_code}`` no longer leak faults across entities. The previous implementation relied on a prefix match against the entity's ``namespace_path`` and silently disabled the scope filter when the entity had an empty ``namespace_path`` (host-derived / synthetic components, manifest components without a ``namespace`` field, Areas, Functions), exposing - and on ``DELETE``, clearing - faults reported by apps that belonged to entirely different entities. Both handlers now resolve the addressed entity to its hosted-app FQN set and return ``404 Resource Not Found`` when the fault's ``reporting_sources`` does not intersect that set. The underlying ``GetFault.srv`` / ``ClearFault.srv`` contract is unchanged. Any client that previously received a ``200`` / ``204`` for a fault outside an entity's scope will now get a ``404`` (`#395 `_) * ``GET /api/v1/updates/{id}/status`` no longer returns ``404`` for a registered-but-idle package; ``POST /api/v1/updates`` now seeds a ``pending`` status, so the endpoint returns ``200 {"status": "pending"}`` immediately after registration. ``404`` is reserved for packages that are not registered. Clients that used ``404`` as a signal for "registered but nothing started yet" must adapt (`#378 `_) **Features:** diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp index ab019d86..96be3fc8 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -339,6 +340,30 @@ class HandlerContext { static std::vector resolve_app_host_fqns(const ThreadSafeEntityCache & cache, const std::vector & app_ids); + /** + * @brief Resolve an entity to the set of source FQNs it owns. + * + * Returns the FQNs of every ROS 2 node within the entity's scope. Used by + * fault handlers to filter `reporting_sources` so that per-entity routes + * never expose faults reported from outside the addressed entity. + * + * Mapping per entity type: + * - App: the app's `effective_fqn()` (single entry, or empty set if unbound) + * - Component: `effective_fqn()` of every hosted app via `get_apps_for_component()` + * - Area: `effective_fqn()` of every app in every component under the area + * - Function: `node_fqn` of every entry in the function's aggregation configuration + * - Unknown: empty set + * + * An empty result means "no apps are in scope" and callers must treat any + * fault as out-of-scope (i.e., return 404), never as "no filter". + * + * @param cache Entity cache for lookups + * @param entity Resolved entity info (from `validate_entity_for_route`) + * @return Set of FQNs that uniquely scopes faults to this entity + */ + static std::set resolve_entity_source_fqns(const ThreadSafeEntityCache & cache, + const EntityInfo & entity); + private: GatewayNode * node_; CorsConfig cors_config_; diff --git a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp index 4c60328a..6fa29c80 100644 --- a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp +++ b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp @@ -40,6 +40,32 @@ namespace handlers { namespace { +/// Check if a single fault's reporting_sources intersects the given prefix set. +/// Returns true when any reporting_source starts with any prefix. An empty +/// prefix set always returns false ("nothing is in scope"), so callers must +/// supply the entity's own FQN set explicitly - a fault is in scope only when +/// the entity owns at least one app that reported it. +bool fault_in_source_scope(const json & fault, const std::set & source_prefixes) { + if (source_prefixes.empty()) { + return false; + } + if (!fault.contains("reporting_sources") || !fault["reporting_sources"].is_array()) { + return false; + } + for (const auto & src : fault["reporting_sources"]) { + if (!src.is_string()) { + continue; + } + const auto src_str = src.get(); + for (const auto & prefix : source_prefixes) { + if (src_str.rfind(prefix, 0) == 0) { + return true; + } + } + } + return false; +} + /// Helper to filter faults JSON array by a set of namespace prefixes /// Keeps faults where any reporting_source starts with any of the given prefixes json filter_faults_by_sources(const json & faults_array, const std::set & source_prefixes) { @@ -672,17 +698,30 @@ void FaultHandlers::handle_get_fault(const httplib::Request & req, httplib::Resp return; } - std::string namespace_path = entity_info.namespace_path; - auto fault_mgr = ctx_.node()->get_fault_manager(); - // Use get_fault_with_env to get fault with environment data - auto result = fault_mgr->get_fault_with_env(fault_code, namespace_path); + // Fetch the fault without manager-level scope filter. The transport's + // `source_id` prefix match silently disables itself when the entity has + // an empty namespace_path (synthetic / host-derived components, manifest + // components without a `namespace` field, Areas, Functions), so we + // re-scope here against the actual FQNs the entity owns. See #395. + auto result = fault_mgr->get_fault_with_env(fault_code, ""); if (result.success) { // Build SOVD-compliant response from the transport-supplied JSON shape. // The transport handed back `result.data = { "fault": {...}, "environment_data": {...} }`. const auto & fault_json = result.data.value("fault", json::object()); + + const auto & cache = ctx_.node()->get_thread_safe_cache(); + auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); + if (!fault_in_source_scope(fault_json, source_fqns)) { + HandlerContext::send_error(res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", + {{"details", "Fault is not reported by any app within this entity's scope"}, + {entity_info.id_field, entity_id}, + {"fault_code", fault_code}}); + return; + } + const auto & env_data_json = result.data.value("environment_data", json::object()); auto response = build_sovd_fault_response(fault_json, env_data_json, entity_path_info->entity_path); @@ -769,6 +808,37 @@ void FaultHandlers::handle_clear_fault(const httplib::Request & req, httplib::Re } auto fault_mgr = ctx_.node()->get_fault_manager(); + + // Verify the fault is in this entity's scope BEFORE clearing. The + // underlying ClearFault.srv has no scope argument, so without this + // check a DELETE on one entity's route would clear faults owned by + // any other entity that happens to share the fault_code. See #395. + auto get_result = fault_mgr->get_fault_with_env(fault_code, ""); + if (!get_result.success) { + if (get_result.error_message.find("not found") != std::string::npos || + get_result.error_message.find("Fault not found") != std::string::npos) { + HandlerContext::send_error( + res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", + {{"details", get_result.error_message}, {entity_info.id_field, entity_id}, {"fault_code", fault_code}}); + } else { + HandlerContext::send_error( + res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to clear fault", + {{"details", get_result.error_message}, {entity_info.id_field, entity_id}, {"fault_code", fault_code}}); + } + return; + } + + const auto & cache = ctx_.node()->get_thread_safe_cache(); + auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); + const auto & fault_json = get_result.data.value("fault", json::object()); + if (!fault_in_source_scope(fault_json, source_fqns)) { + HandlerContext::send_error(res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", + {{"details", "Fault is not reported by any app within this entity's scope"}, + {entity_info.id_field, entity_id}, + {"fault_code", fault_code}}); + return; + } + auto result = fault_mgr->clear_fault(fault_code); if (result.success) { diff --git a/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp b/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp index 40508a47..301c5a37 100644 --- a/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp +++ b/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp @@ -347,6 +347,59 @@ void HandlerContext::send_json(httplib::Response & res, const json & data) { res.set_content(data.dump(2), "application/json"); } +namespace { + +void collect_app_fqn(const ThreadSafeEntityCache & cache, const std::string & app_id, std::set & out) { + auto app = cache.get_app(app_id); + if (!app) { + return; + } + auto fqn = app->effective_fqn(); + if (fqn.empty()) { + return; + } + out.insert(std::move(fqn)); +} + +} // namespace + +std::set HandlerContext::resolve_entity_source_fqns(const ThreadSafeEntityCache & cache, + const EntityInfo & entity) { + std::set fqns; + switch (entity.type) { + case EntityType::APP: { + collect_app_fqn(cache, entity.id, fqns); + break; + } + case EntityType::COMPONENT: { + for (const auto & app_id : cache.get_apps_for_component(entity.id)) { + collect_app_fqn(cache, app_id, fqns); + } + break; + } + case EntityType::AREA: { + for (const auto & comp_id : cache.get_components_for_area(entity.id)) { + for (const auto & app_id : cache.get_apps_for_component(comp_id)) { + collect_app_fqn(cache, app_id, fqns); + } + } + break; + } + case EntityType::FUNCTION: { + auto agg_configs = cache.get_entity_configurations(entity.id); + for (const auto & node : agg_configs.nodes) { + if (!node.node_fqn.empty()) { + fqns.insert(node.node_fqn); + } + } + break; + } + case EntityType::UNKNOWN: + break; + } + return fqns; +} + std::vector HandlerContext::resolve_app_host_fqns(const ThreadSafeEntityCache & cache, const std::vector & app_ids) { // Order-preserving dedup: two app_ids that resolve to the same effective_fqn diff --git a/src/ros2_medkit_gateway/test/test_handler_context.cpp b/src/ros2_medkit_gateway/test/test_handler_context.cpp index bc864269..c2221c42 100644 --- a/src/ros2_medkit_gateway/test/test_handler_context.cpp +++ b/src/ros2_medkit_gateway/test/test_handler_context.cpp @@ -1231,6 +1231,114 @@ TEST(ResolveAppHostFqnsTest, DuplicateEffectiveFqnsAreDeduplicated) { EXPECT_EQ(fqns[1], "/powertrain/engine/rpm_sensor"); } +// ============================================================================= +// resolve_entity_source_fqns tests +// +// The helper underpins #395's per-entity fault scope check. It must reject +// faults that come from outside the entity (empty result -> caller returns +// 404) AND must collect the FQNs of every hosted ROS node so that +// reporting-source matching covers all the apps the entity actually owns. +// ============================================================================= + +namespace { + +EntityInfo make_entity_info(EntityType t, const std::string & id) { + EntityInfo ei; + ei.type = t; + ei.id = id; + return ei; +} + +App make_owned_app(const std::string & app_id, const std::string & component_id, const std::string & node_name, + const std::string & ns) { + App a = make_app_with_binding(app_id, node_name, ns); + a.component_id = component_id; + return a; +} + +} // namespace + +TEST(ResolveEntitySourceFqnsTest, AppReturnsItsOwnFqn) { + ThreadSafeEntityCache cache; + cache.update_apps({make_app_with_binding("temp_sensor", "temp_sensor", "/powertrain/engine")}); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::APP, "temp_sensor")); + EXPECT_EQ(fqns, std::set{"/powertrain/engine/temp_sensor"}); +} + +TEST(ResolveEntitySourceFqnsTest, AppMissingFromCacheYieldsEmptySet) { + ThreadSafeEntityCache cache; + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::APP, "gone")); + EXPECT_TRUE(fqns.empty()); +} + +TEST(ResolveEntitySourceFqnsTest, ComponentAggregatesHostedAppFqns) { + ThreadSafeEntityCache cache; + cache.update_apps({ + make_owned_app("temp-sensor", "temp-hw", "temp_sensor", "/powertrain/engine"), + make_owned_app("rpm-sensor", "rpm-hw", "rpm_sensor", "/powertrain/engine"), + make_owned_app("lidar-sensor", "lidar-unit", "lidar_sensor", "/perception/lidar"), + }); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::COMPONENT, "temp-hw")); + // The lidar app must not appear: the bug under #395 was that an empty + // namespace_path on the addressed component silently disabled the scope + // filter and exposed faults from unrelated apps. + EXPECT_EQ(fqns, std::set{"/powertrain/engine/temp_sensor"}); +} + +TEST(ResolveEntitySourceFqnsTest, ComponentWithoutHostedAppsReturnsEmptySet) { + ThreadSafeEntityCache cache; + cache.update_apps({make_owned_app("temp-sensor", "temp-hw", "temp_sensor", "/powertrain/engine")}); + + // Querying a different component yields an empty set, NOT "no filter". + // Callers (fault handlers) treat empty as out-of-scope -> 404. + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::COMPONENT, "lidar-unit")); + EXPECT_TRUE(fqns.empty()); +} + +TEST(ResolveEntitySourceFqnsTest, AreaCollectsAppsFromAllComponentsInArea) { + ThreadSafeEntityCache cache; + Component comp_a; + comp_a.id = "temp-hw"; + comp_a.area = "engine"; + Component comp_b; + comp_b.id = "rpm-hw"; + comp_b.area = "engine"; + Component comp_off_area; + comp_off_area.id = "lidar-unit"; + comp_off_area.area = "perception"; + cache.update_components({comp_a, comp_b, comp_off_area}); + cache.update_apps({ + make_owned_app("temp-sensor", "temp-hw", "temp_sensor", "/powertrain/engine"), + make_owned_app("rpm-sensor", "rpm-hw", "rpm_sensor", "/powertrain/engine"), + make_owned_app("lidar-sensor", "lidar-unit", "lidar_sensor", "/perception/lidar"), + }); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::AREA, "engine")); + std::set expected{"/powertrain/engine/temp_sensor", "/powertrain/engine/rpm_sensor"}; + EXPECT_EQ(fqns, expected); +} + +TEST(ResolveEntitySourceFqnsTest, UnknownEntityTypeReturnsEmptySet) { + ThreadSafeEntityCache cache; + cache.update_apps({make_app_with_binding("temp_sensor", "temp_sensor", "/powertrain/engine")}); + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::UNKNOWN, "anything")); + EXPECT_TRUE(fqns.empty()); +} + +TEST(ResolveEntitySourceFqnsTest, AppsWithEmptyEffectiveFqnAreSkipped) { + ThreadSafeEntityCache cache; + App no_binding; + no_binding.id = "no_binding"; + no_binding.component_id = "comp-x"; + App ok = make_owned_app("ok", "comp-x", "ok_node", "/ns"); + cache.update_apps({no_binding, ok}); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::COMPONENT, "comp-x")); + EXPECT_EQ(fqns, std::set{"/ns/ok_node"}); +} + int main(int argc, char ** argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py new file mode 100644 index 00000000..bf4210f5 --- /dev/null +++ b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +# Copyright 2026 bburda +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Regression tests for cross-entity fault scope leak (#395). + +Per-entity fault routes (`GET` / `DELETE +/components/{id}/faults/{fault_code}`) must filter by the apps the entity +owns. Without this, components whose `namespace_path` is empty (synthetic, +host-derived, or manifest components without an explicit ``namespace`` +field) would short-circuit the transport-level prefix filter and expose +faults reported by apps that belong to entirely different components. + +These tests use the demo manifest in hybrid mode. Manifest components +in that file declare ``area`` but not ``namespace``, so every component +sees the bug pre-fix. +""" + +import os +import unittest + +from ament_index_python.packages import get_package_share_directory +import launch_testing +import launch_testing.actions +import requests + +from ros2_medkit_test_utils.constants import ALLOWED_EXIT_CODES +from ros2_medkit_test_utils.gateway_test_case import GatewayTestCase +from ros2_medkit_test_utils.launch_helpers import create_test_launch + + +LIDAR_FAULT_CODE = 'LIDAR_RANGE_INVALID' +LIDAR_OWNER_COMPONENT = 'lidar-unit' +OTHER_COMPONENT = 'temp-sensor-hw' + + +def generate_test_description(): + pkg_share = get_package_share_directory('ros2_medkit_gateway') + manifest_path = os.path.join( + pkg_share, 'config', 'examples', 'demo_nodes_manifest.yaml' + ) + return create_test_launch( + demo_nodes=['lidar_sensor', 'temp_sensor'], + fault_manager=True, + lidar_faulty=True, + fault_manager_params={'confirmation_threshold': -2}, + gateway_params={ + 'discovery.mode': 'hybrid', + 'discovery.manifest_path': manifest_path, + 'discovery.manifest_strict_validation': False, + 'discovery.merge_pipeline.gap_fill.allow_heuristic_apps': True, + }, + ) + + +class TestFaultsScopeIsolation(GatewayTestCase): + """Per-entity fault routes must not leak faults across components.""" + + MIN_EXPECTED_APPS = 2 + REQUIRED_APPS = {'lidar-sensor'} + + def setUp(self): + super().setUp() + # The lidar app reports LIDAR_RANGE_INVALID deterministically when + # launched with faulty params. Wait until the gateway sees it on the + # owning app before exercising the per-component routes. + self.wait_for_fault('/apps/lidar-sensor', LIDAR_FAULT_CODE) + + def test_get_fault_returns_404_on_unrelated_component(self): + """GET on a different component must not return another entity's fault. + + @verifies REQ_INTEROP_013 + """ + response = requests.get( + f'{self.BASE_URL}/components/{OTHER_COMPONENT}/faults/{LIDAR_FAULT_CODE}', + timeout=10, + ) + self.assertEqual( + response.status_code, 404, + f'Expected 404, got {response.status_code} body={response.text}', + ) + + def test_get_fault_returns_200_on_owning_component(self): + """GET on the component that hosts the reporting app must return the fault.""" + data = self.get_json( + f'/components/{LIDAR_OWNER_COMPONENT}/faults/{LIDAR_FAULT_CODE}' + ) + self.assertEqual(data.get('item', {}).get('code'), LIDAR_FAULT_CODE) + + def test_clear_fault_returns_404_on_unrelated_component(self): + """DELETE on a different component must not clear another entity's fault.""" + response = requests.delete( + f'{self.BASE_URL}/components/{OTHER_COMPONENT}/faults/{LIDAR_FAULT_CODE}', + timeout=10, + ) + self.assertEqual( + response.status_code, 404, + f'Expected 404, got {response.status_code} body={response.text}', + ) + + # Verify the fault is still present on the owning app afterwards. + listing = self.get_json('/apps/lidar-sensor/faults') + codes = {item.get('fault_code') for item in listing.get('items', [])} + self.assertIn( + LIDAR_FAULT_CODE, codes, + 'Unrelated DELETE must not clear faults outside its scope', + ) + + +@launch_testing.post_shutdown_test() +class TestShutdown(unittest.TestCase): + + def test_exit_codes(self, proc_info): + """Check all processes exited cleanly (SIGTERM allowed).""" + for info in proc_info: + self.assertIn( + info.returncode, ALLOWED_EXIT_CODES, + f'{info.process_name} exited with code {info.returncode}' + ) From cd8335f31302f525ad1b8d8b480710bdee350bdc Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Thu, 14 May 2026 13:42:55 +0200 Subject: [PATCH 2/6] fix(gateway,#395): scope app-level list/clear-all by FQN set too The first pass on #395 only fixed the per-fault GET/DELETE routes. The collection routes (handle_list_faults, handle_clear_all_faults) still passed entity_info.namespace_path as the transport-level prefix filter for App entities. Apps with a wildcard ros_binding.namespace_pattern yield an empty effective_fqn, the transport silently disabled the filter, and the list/clear-all routes leaked - or cleared - every fault in the system. Both App branches now route through resolve_entity_source_fqns plus filter_faults_by_sources, matching how Component / Area / Function are already handled. Also: - Move the #395 CHANGELOG entry to Breaking Changes (the 200 -> 404 flip is observable to clients). - Document 404 on GET /{entity}/faults/{fault_code} in docs/api/rest.rst. - Add @verifies REQ_INTEROP_013 / REQ_INTEROP_015 tags to the scope-isolation tests so the generated verification matrix picks them up. - Add a DELETE happy-path test that clears the fault on the owning component and asserts 204. - Add unit coverage for resolve_entity_source_fqns on FUNCTION entities (aggregation + skip-unbound-hosts cases). - Note the wildcard-App limitation in the helper doxygen. - Drop unused launch_testing.actions import in the integration test. --- docs/api/rest.rst | 8 +++- src/ros2_medkit_gateway/CHANGELOG.rst | 4 +- .../http/handlers/handler_context.hpp | 8 +++- .../src/http/handlers/fault_handlers.cpp | 33 ++++++++++++----- .../test/test_handler_context.cpp | 37 +++++++++++++++++++ .../test_faults_scope_isolation.test.py | 31 ++++++++++++++-- 6 files changed, 103 insertions(+), 18 deletions(-) diff --git a/docs/api/rest.rst b/docs/api/rest.rst index cab8e8d1..044a77f3 100644 --- a/docs/api/rest.rst +++ b/docs/api/rest.rst @@ -750,11 +750,17 @@ Query and manage faults. - ``freeze_frame``: Topic data captured at fault confirmation - ``rosbag``: Recording file available via bulk-data endpoint + **Response codes:** + + - **200:** Fault details + - **404:** Fault not found, or reported by an app outside this entity's scope + - **503:** Fault manager unavailable + ``DELETE /api/v1/components/{id}/faults/{fault_code}`` Clear a fault. - **204:** Fault cleared - - **404:** Fault not found + - **404:** Fault not found, or reported by an app outside this entity's scope ``DELETE /api/v1/components/{id}/faults`` Clear all faults for an entity. diff --git a/src/ros2_medkit_gateway/CHANGELOG.rst b/src/ros2_medkit_gateway/CHANGELOG.rst index bcfe642f..4657166c 100644 --- a/src/ros2_medkit_gateway/CHANGELOG.rst +++ b/src/ros2_medkit_gateway/CHANGELOG.rst @@ -5,9 +5,9 @@ Changelog for package ros2_medkit_gateway Unreleased ---------- -**Fixes:** +**Breaking Changes:** -* ``GET /api/v1/{entity-path}/faults/{fault_code}`` and ``DELETE /api/v1/{entity-path}/faults/{fault_code}`` no longer leak faults across entities. The previous implementation relied on a prefix match against the entity's ``namespace_path`` and silently disabled the scope filter when the entity had an empty ``namespace_path`` (host-derived / synthetic components, manifest components without a ``namespace`` field, Areas, Functions), exposing - and on ``DELETE``, clearing - faults reported by apps that belonged to entirely different entities. Both handlers now resolve the addressed entity to its hosted-app FQN set and return ``404 Resource Not Found`` when the fault's ``reporting_sources`` does not intersect that set. The underlying ``GetFault.srv`` / ``ClearFault.srv`` contract is unchanged. Any client that previously received a ``200`` / ``204`` for a fault outside an entity's scope will now get a ``404`` (`#395 `_) +* Per-entity fault routes are now correctly scoped to the entity's hosted apps. ``GET /api/v1/{entity-path}/faults/{fault_code}``, ``DELETE /api/v1/{entity-path}/faults/{fault_code}``, ``GET /api/v1/{entity-path}/faults``, and ``DELETE /api/v1/{entity-path}/faults`` previously fell back to a prefix match against the entity's ``namespace_path``; when that was empty (host-derived / synthetic components, manifest components without a ``namespace`` field, Areas, Functions, and Apps with a wildcard ``ros_binding.namespace_pattern``) the scope filter was silently disabled and the routes exposed - and on ``DELETE``, cleared - faults reported by apps that belonged to entirely different entities. All four handlers now resolve the addressed entity to its hosted-app FQN set (via the new ``HandlerContext::resolve_entity_source_fqns`` helper) and apply that set as a positive scope filter. Per-fault routes return ``404 Resource Not Found`` when the fault's ``reporting_sources`` does not intersect the set; collection routes return an empty ``items`` array. The underlying ``GetFault.srv`` / ``ClearFault.srv`` contracts are unchanged. Any client that previously received a ``200`` / ``204`` (single fault) or a populated ``items`` array for faults outside an entity's scope will now get ``404`` or an empty list (`#395 `_) * ``GET /api/v1/updates/{id}/status`` no longer returns ``404`` for a registered-but-idle package; ``POST /api/v1/updates`` now seeds a ``pending`` status, so the endpoint returns ``200 {"status": "pending"}`` immediately after registration. ``404`` is reserved for packages that are not registered. Clients that used ``404`` as a signal for "registered but nothing started yet" must adapt (`#378 `_) **Features:** diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp index 96be3fc8..7b863842 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp @@ -348,10 +348,14 @@ class HandlerContext { * never expose faults reported from outside the addressed entity. * * Mapping per entity type: - * - App: the app's `effective_fqn()` (single entry, or empty set if unbound) + * - App: the app's `effective_fqn()` (single entry, or empty set if unbound + * or if `ros_binding.namespace_pattern` is a wildcard - by design + * `effective_fqn()` returns "" for those, so the scope filter treats them + * as having no addressable node and any fault read becomes a 404) * - Component: `effective_fqn()` of every hosted app via `get_apps_for_component()` * - Area: `effective_fqn()` of every app in every component under the area - * - Function: `node_fqn` of every entry in the function's aggregation configuration + * - Function: non-empty `node_fqn` of every entry in the function's aggregation + * configuration (entries with empty `node_fqn` are skipped) * - Unknown: empty set * * An empty result means "no apps are in scope" and callers must treat any diff --git a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp index 6fa29c80..8233874a 100644 --- a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp +++ b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp @@ -595,20 +595,27 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re return; } - // For Apps, use namespace_path filtering - std::string namespace_path = entity_info.namespace_path; - auto result = - fault_mgr->list_faults(namespace_path, filter.include_pending, filter.include_confirmed, filter.include_cleared, - filter.include_healed, include_muted, include_clusters); + // For Apps, scope by the app's effective FQN set. We can't reuse the + // transport-level prefix filter (`list_faults(namespace_path, ...)`) + // because an App with a wildcard `ros_binding.namespace_pattern` produces + // an empty effective_fqn, the transport silently disables the filter, and + // every fault in the system would be returned. Same bug class as #395 for + // Components - fix it the same way. + const auto & cache = ctx_.node()->get_thread_safe_cache(); + auto app_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); + auto result = fault_mgr->list_faults("", filter.include_pending, filter.include_confirmed, filter.include_cleared, + filter.include_healed, include_muted, include_clusters); if (result.success) { + json filtered_faults = filter_faults_by_sources(result.data["faults"], app_fqns); + // Format: items array at top level - json response = {{"items", result.data["faults"]}}; + json response = {{"items", filtered_faults}}; // x-medkit extension for ros2_medkit-specific fields XMedkit ext; ext.entity_id(entity_id); - ext.add("source_id", namespace_path); + ext.add("source_id", entity_info.namespace_path); // Include detailed correlation data if requested and present if (result.data.contains("muted_faults")) { @@ -1006,14 +1013,20 @@ void FaultHandlers::handle_clear_all_faults(const httplib::Request & req, httpli faults_to_clear = filter_faults_by_sources(result.data["faults"], app_fqns); } else { - // Apps: use namespace_path filtering directly - auto result = fault_mgr->list_faults(entity_info.namespace_path); + // Apps: scope by the app's effective FQN. Cannot use the transport-level + // prefix filter (`list_faults(namespace_path)`) directly because Apps + // with a wildcard `ros_binding.namespace_pattern` produce an empty + // effective_fqn, the transport silently disables filtering, and we + // would clear every fault in the system. Same bug class as #395. + auto result = fault_mgr->list_faults(""); if (!result.success) { HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to retrieve faults", {{"details", result.error_message}, {entity_info.id_field, entity_id}}); return; } - faults_to_clear = result.data["faults"]; + const auto & cache = ctx_.node()->get_thread_safe_cache(); + auto app_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); + faults_to_clear = filter_faults_by_sources(result.data["faults"], app_fqns); } // Clear each matching fault diff --git a/src/ros2_medkit_gateway/test/test_handler_context.cpp b/src/ros2_medkit_gateway/test/test_handler_context.cpp index c2221c42..1c64037c 100644 --- a/src/ros2_medkit_gateway/test/test_handler_context.cpp +++ b/src/ros2_medkit_gateway/test/test_handler_context.cpp @@ -1339,6 +1339,43 @@ TEST(ResolveEntitySourceFqnsTest, AppsWithEmptyEffectiveFqnAreSkipped) { EXPECT_EQ(fqns, std::set{"/ns/ok_node"}); } +TEST(ResolveEntitySourceFqnsTest, FunctionAggregatesHostedAppFqns) { + ThreadSafeEntityCache cache; + cache.update_apps({ + make_app_with_binding("nav-planner", "planner", "/perception/nav"), + make_app_with_binding("nav-localizer", "localizer", "/perception/nav"), + make_app_with_binding("unrelated", "telemetry", "/telemetry"), + }); + Function autonomy; + autonomy.id = "autonomous-navigation"; + autonomy.name = "Autonomous Navigation"; + autonomy.hosts = {"nav-planner", "nav-localizer"}; + cache.update_functions({autonomy}); + + auto fqns = HandlerContext::resolve_entity_source_fqns( + cache, make_entity_info(EntityType::FUNCTION, "autonomous-navigation")); + std::set expected{"/perception/nav/planner", "/perception/nav/localizer"}; + EXPECT_EQ(fqns, expected); +} + +TEST(ResolveEntitySourceFqnsTest, FunctionWithUnboundHostsReturnsOnlyResolvedFqns) { + // Function.hosts referencing an app that does not produce an effective_fqn + // (no bound_fqn and no ros_binding) must be skipped, not return empty FQNs + // - they would prefix-match everything in `fault_in_source_scope` and the + // scope check would let any fault through. + ThreadSafeEntityCache cache; + App unbound; + unbound.id = "unbound"; + cache.update_apps({unbound, make_app_with_binding("planner", "planner", "/perception/nav")}); + Function f; + f.id = "func-x"; + f.hosts = {"unbound", "planner"}; + cache.update_functions({f}); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::FUNCTION, "func-x")); + EXPECT_EQ(fqns, std::set{"/perception/nav/planner"}); +} + int main(int argc, char ** argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py index bf4210f5..170dc2a3 100644 --- a/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py +++ b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py @@ -32,7 +32,6 @@ from ament_index_python.packages import get_package_share_directory import launch_testing -import launch_testing.actions import requests from ros2_medkit_test_utils.constants import ALLOWED_EXIT_CODES @@ -92,14 +91,20 @@ def test_get_fault_returns_404_on_unrelated_component(self): ) def test_get_fault_returns_200_on_owning_component(self): - """GET on the component that hosts the reporting app must return the fault.""" + """GET on the component that hosts the reporting app must return the fault. + + @verifies REQ_INTEROP_013 + """ data = self.get_json( f'/components/{LIDAR_OWNER_COMPONENT}/faults/{LIDAR_FAULT_CODE}' ) self.assertEqual(data.get('item', {}).get('code'), LIDAR_FAULT_CODE) def test_clear_fault_returns_404_on_unrelated_component(self): - """DELETE on a different component must not clear another entity's fault.""" + """DELETE on a different component must not clear another entity's fault. + + @verifies REQ_INTEROP_015 + """ response = requests.delete( f'{self.BASE_URL}/components/{OTHER_COMPONENT}/faults/{LIDAR_FAULT_CODE}', timeout=10, @@ -117,6 +122,26 @@ def test_clear_fault_returns_404_on_unrelated_component(self): 'Unrelated DELETE must not clear faults outside its scope', ) + def test_zzz_clear_fault_returns_204_on_owning_component(self): + """DELETE on the owning component clears the fault. + + Named `_zzz_` so unittest's alphabetic ordering runs this last - it is + destructive (clears the lidar fault from the global FaultManager). The + lidar_sensor demo re-reports LIDAR_RANGE_INVALID continuously, so this + does not break other test runs, but in-suite siblings rely on the fault + being present via the setUp `wait_for_fault` poll. + + @verifies REQ_INTEROP_015 + """ + response = requests.delete( + f'{self.BASE_URL}/components/{LIDAR_OWNER_COMPONENT}/faults/{LIDAR_FAULT_CODE}', + timeout=10, + ) + self.assertEqual( + response.status_code, 204, + f'Expected 204, got {response.status_code} body={response.text}', + ) + @launch_testing.post_shutdown_test() class TestShutdown(unittest.TestCase): From 39e43cc05db5a41d2baa7011e8d1cbaab92bb4a6 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Thu, 14 May 2026 14:34:38 +0200 Subject: [PATCH 3/6] fix(gateway,#395): tighter scope check, area recursion, function components Address the eight review findings from the post-push self-review pass: - Source matching now enforces a path boundary: a node is in scope iff it equals one of the entity's owned FQNs or is a strict path-child (`/<...>`). Pre-fix, `rfind(prefix, 0) == 0` let sibling nodes like `/ns/node_extra` pass for an entity owning `/ns/node`. - `fault_in_source_scope` now requires ALL reporting sources to be in scope. The old "any match" semantic exposed two cross-entity paths: the GET response carries the fault's full `reporting_sources` and environment data; the DELETE issues an unscoped `ClearFault.srv` that clears the aggregated record across every source for the code. `filter_faults_by_sources` inherits the same semantic so per-entity collection routes and per-fault routes agree on what counts as in scope. - `resolve_entity_source_fqns` AREA branch walks `get_subareas()` recursively. The manifest model nests areas (e.g. `engine` under `powertrain`) and components attach to subareas, so a top-level area query previously returned an empty set. - `resolve_entity_source_fqns` FUNCTION branch handles host IDs that are components (Function.hosts is documented as "App or Component IDs"), expanding component hosts to their apps. The cache's `function_to_apps_` index only resolves app hosts. - `fault_in_source_scope` is now a public static method on FaultHandlers so it can be unit-tested directly without a launch test. Added test_fault_handlers cases for exact match, path-child match, prefix collision, multi-source all-in-scope, multi-source partial, empty-scope, missing field, malformed entry, and root-FQN edge. - Added test_handler_context cases for nested-subarea recursion and function-hosting-component expansion. - Integration test setUp now verifies both the owning and the unrelated component were actually discovered before asserting 404. Without it, a 404 from an undiscovered entity is indistinguishable from the 404 the scope filter produces, so the regression would silently pass even with the fix reverted. - Doxygen for `resolve_entity_source_fqns` reworded to be generic about what callers do with an empty result (per-fault routes 404, list 200 with empty `items`, clear-all 204). --- .../http/handlers/fault_handlers.hpp | 29 ++++++ .../http/handlers/handler_context.hpp | 28 ++++-- .../src/http/handlers/fault_handlers.cpp | 86 +++++++++--------- .../src/http/handlers/handler_context.cpp | 78 +++++++++++----- .../test/test_fault_handlers.cpp | 89 +++++++++++++++++++ .../test/test_handler_context.cpp | 62 +++++++++++++ .../test_faults_scope_isolation.test.py | 14 +++ 7 files changed, 312 insertions(+), 74 deletions(-) diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/fault_handlers.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/fault_handlers.hpp index 014fdf0f..87bf9d5b 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/fault_handlers.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/fault_handlers.hpp @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include "ros2_medkit_gateway/http/handlers/handler_context.hpp" @@ -105,6 +106,34 @@ class FaultHandlers { const nlohmann::json & env_data_json, const std::string & entity_path); + /** + * @brief Scope check used by per-entity fault routes. + * + * Returns true iff every entry in `fault["reporting_sources"]` is in scope + * for `source_fqns`. A source matches a scope FQN when it equals the FQN + * or is a strict path-child (i.e. `/<...>`), so similarly named nodes + * like `/ns/node` and `/ns/node_extra` are not conflated. + * + * The "all sources must match" semantic (rather than "any source") is + * deliberate: it blocks two cross-entity escalation paths. + * + * 1. GET would otherwise return a response whose `reporting_sources` and + * environment data carry identities of nodes the caller has no + * business reading. + * 2. DELETE would otherwise let a viewer of entity A clear the aggregated + * fault record for a fault that entity B also reports, because the + * underlying `ClearFault.srv` has no scope argument. + * + * An empty scope set, an empty `reporting_sources` array, a missing + * `reporting_sources` field, or any non-string source entry all return + * false - there is no vacuous "all match" case. + * + * Public for direct unit testing; called by `handle_get_fault`, + * `handle_clear_fault`, and indirectly via `filter_faults_by_sources` by + * the collection routes. + */ + static bool fault_in_source_scope(const nlohmann::json & fault, const std::set & source_fqns); + private: HandlerContext & ctx_; }; diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp index 7b863842..c150a3e9 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp @@ -350,16 +350,26 @@ class HandlerContext { * Mapping per entity type: * - App: the app's `effective_fqn()` (single entry, or empty set if unbound * or if `ros_binding.namespace_pattern` is a wildcard - by design - * `effective_fqn()` returns "" for those, so the scope filter treats them - * as having no addressable node and any fault read becomes a 404) - * - Component: `effective_fqn()` of every hosted app via `get_apps_for_component()` - * - Area: `effective_fqn()` of every app in every component under the area - * - Function: non-empty `node_fqn` of every entry in the function's aggregation - * configuration (entries with empty `node_fqn` are skipped) - * - Unknown: empty set + * `effective_fqn()` returns "" for those, so the entity simply has no + * addressable ROS node and the scope check excludes every fault). + * - Component: `effective_fqn()` of every hosted app via + * `get_apps_for_component()`. + * - Area: `effective_fqn()` of every app under the area, walking + * `get_subareas()` recursively so nested areas (e.g. ``powertrain -> + * engine``) resolve to the union of their descendants' apps. + * - Function: `effective_fqn()` of every app the function hosts directly + * plus, for hosts that are component IDs, the apps inside those + * components. + * - Unknown: empty set. * - * An empty result means "no apps are in scope" and callers must treat any - * fault as out-of-scope (i.e., return 404), never as "no filter". + * Apps whose `effective_fqn()` is empty are silently dropped from the set + * so the scope check cannot match arbitrary FQNs against an empty prefix. + * + * An empty result means "no apps are in scope" and callers must NEVER + * interpret that as "no filter" - any fault must be treated as out of + * scope. The exact response (404 for per-fault routes, an empty `items` + * array for collection lists, 204 for collection clears) is up to the + * caller. * * @param cache Entity cache for lookups * @param entity Resolved entity info (from `validate_entity_for_route`) diff --git a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp index 8233874a..6d1273b1 100644 --- a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp +++ b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp @@ -40,55 +40,33 @@ namespace handlers { namespace { -/// Check if a single fault's reporting_sources intersects the given prefix set. -/// Returns true when any reporting_source starts with any prefix. An empty -/// prefix set always returns false ("nothing is in scope"), so callers must -/// supply the entity's own FQN set explicitly - a fault is in scope only when -/// the entity owns at least one app that reported it. -bool fault_in_source_scope(const json & fault, const std::set & source_prefixes) { - if (source_prefixes.empty()) { - return false; - } - if (!fault.contains("reporting_sources") || !fault["reporting_sources"].is_array()) { - return false; - } - for (const auto & src : fault["reporting_sources"]) { - if (!src.is_string()) { - continue; +/// Check if a ROS node FQN falls within the entity's source FQN set. +/// +/// A node is in scope iff it equals one of the entity's owned FQNs, OR is a +/// strict path-child of one (i.e., `/<...>`). We deliberately do +/// NOT use raw `rfind(prefix, 0)` because that would let `/ns/node_extra` +/// pass for an entity owning `/ns/node`. Path boundary is enforced by +/// requiring the byte after the prefix to be `/`. +bool source_matches_scope(const std::string & src, const std::set & scope_fqns) { + for (const auto & fqn : scope_fqns) { + if (src == fqn) { + return true; } - const auto src_str = src.get(); - for (const auto & prefix : source_prefixes) { - if (src_str.rfind(prefix, 0) == 0) { - return true; - } + if (src.size() > fqn.size() && src.compare(0, fqn.size(), fqn) == 0 && src[fqn.size()] == '/') { + return true; } } return false; } -/// Helper to filter faults JSON array by a set of namespace prefixes -/// Keeps faults where any reporting_source starts with any of the given prefixes -json filter_faults_by_sources(const json & faults_array, const std::set & source_prefixes) { +/// Filter a faults JSON array down to faults whose every reporting source is +/// in the entity's scope. Shares the same all-sources / path-boundary +/// semantics as `FaultHandlers::fault_in_source_scope` so per-entity +/// collection routes and per-fault routes agree on what counts as "in scope". +json filter_faults_by_sources(const json & faults_array, const std::set & source_fqns) { json filtered = json::array(); for (const auto & fault : faults_array) { - if (!fault.contains("reporting_sources")) { - continue; - } - const auto & sources = fault["reporting_sources"]; - bool matches = false; - for (const auto & src : sources) { - const std::string src_str = src.get(); - for (const auto & prefix : source_prefixes) { - if (src_str.rfind(prefix, 0) == 0) { - matches = true; - break; - } - } - if (matches) { - break; - } - } - if (matches) { + if (FaultHandlers::fault_in_source_scope(fault, source_fqns)) { filtered.push_back(fault); } } @@ -192,6 +170,28 @@ json extract_primary_value(const std::string & message_type, const json & full_d } // namespace +bool FaultHandlers::fault_in_source_scope(const json & fault, const std::set & source_fqns) { + if (source_fqns.empty()) { + return false; + } + if (!fault.contains("reporting_sources") || !fault["reporting_sources"].is_array()) { + return false; + } + const auto & sources = fault["reporting_sources"]; + if (sources.empty()) { + return false; + } + for (const auto & src : sources) { + if (!src.is_string()) { + return false; + } + if (!source_matches_scope(src.get(), source_fqns)) { + return false; + } + } + return true; +} + // Static method: Build SOVD-compliant fault response from transport-supplied JSON. // // The transport adapter performs ros2_medkit_msgs -> JSON translation; the @@ -721,7 +721,7 @@ void FaultHandlers::handle_get_fault(const httplib::Request & req, httplib::Resp const auto & cache = ctx_.node()->get_thread_safe_cache(); auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); - if (!fault_in_source_scope(fault_json, source_fqns)) { + if (!FaultHandlers::fault_in_source_scope(fault_json, source_fqns)) { HandlerContext::send_error(res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", {{"details", "Fault is not reported by any app within this entity's scope"}, {entity_info.id_field, entity_id}, @@ -838,7 +838,7 @@ void FaultHandlers::handle_clear_fault(const httplib::Request & req, httplib::Re const auto & cache = ctx_.node()->get_thread_safe_cache(); auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); const auto & fault_json = get_result.data.value("fault", json::object()); - if (!fault_in_source_scope(fault_json, source_fqns)) { + if (!FaultHandlers::fault_in_source_scope(fault_json, source_fqns)) { HandlerContext::send_error(res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", {{"details", "Fault is not reported by any app within this entity's scope"}, {entity_info.id_field, entity_id}, diff --git a/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp b/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp index 301c5a37..8a4562dc 100644 --- a/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp +++ b/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp @@ -361,39 +361,73 @@ void collect_app_fqn(const ThreadSafeEntityCache & cache, const std::string & ap out.insert(std::move(fqn)); } +void collect_component_app_fqns(const ThreadSafeEntityCache & cache, const std::string & comp_id, + std::set & out) { + for (const auto & app_id : cache.get_apps_for_component(comp_id)) { + collect_app_fqn(cache, app_id, out); + } +} + +void collect_area_app_fqns(const ThreadSafeEntityCache & cache, const std::string & area_id, + std::set & out) { + // BFS over (area, subareas...) so a top-level area whose components live in + // nested subareas still resolves to the union of every descendant's apps. + // Without the recursion, e.g. `/areas/powertrain/...` returns an empty set + // when components are attached to `engine` (subarea of `powertrain`). + std::vector pending = {area_id}; + std::set visited; + while (!pending.empty()) { + auto current = std::move(pending.back()); + pending.pop_back(); + if (!visited.insert(current).second) { + continue; + } + for (const auto & comp_id : cache.get_components_for_area(current)) { + collect_component_app_fqns(cache, comp_id, out); + } + for (const auto & sub_id : cache.get_subareas(current)) { + pending.push_back(sub_id); + } + } +} + +void collect_function_app_fqns(const ThreadSafeEntityCache & cache, const std::string & function_id, + std::set & out) { + // Function.hosts can contain either App IDs or Component IDs; the indexed + // lookups in the cache only resolve the App-host case (function_to_apps_), + // so we walk the raw `hosts` list and dispatch per host kind ourselves. + auto func = cache.get_function(function_id); + if (!func) { + return; + } + for (const auto & host_id : func->hosts) { + if (cache.get_app(host_id)) { + collect_app_fqn(cache, host_id, out); + } else if (cache.get_component(host_id)) { + collect_component_app_fqns(cache, host_id, out); + } + // Unknown host - silently skip; it would have been flagged by manifest validation. + } +} + } // namespace std::set HandlerContext::resolve_entity_source_fqns(const ThreadSafeEntityCache & cache, const EntityInfo & entity) { std::set fqns; switch (entity.type) { - case EntityType::APP: { + case EntityType::APP: collect_app_fqn(cache, entity.id, fqns); break; - } - case EntityType::COMPONENT: { - for (const auto & app_id : cache.get_apps_for_component(entity.id)) { - collect_app_fqn(cache, app_id, fqns); - } + case EntityType::COMPONENT: + collect_component_app_fqns(cache, entity.id, fqns); break; - } - case EntityType::AREA: { - for (const auto & comp_id : cache.get_components_for_area(entity.id)) { - for (const auto & app_id : cache.get_apps_for_component(comp_id)) { - collect_app_fqn(cache, app_id, fqns); - } - } + case EntityType::AREA: + collect_area_app_fqns(cache, entity.id, fqns); break; - } - case EntityType::FUNCTION: { - auto agg_configs = cache.get_entity_configurations(entity.id); - for (const auto & node : agg_configs.nodes) { - if (!node.node_fqn.empty()) { - fqns.insert(node.node_fqn); - } - } + case EntityType::FUNCTION: + collect_function_app_fqns(cache, entity.id, fqns); break; - } case EntityType::UNKNOWN: break; } diff --git a/src/ros2_medkit_gateway/test/test_fault_handlers.cpp b/src/ros2_medkit_gateway/test/test_fault_handlers.cpp index c0b5a5e0..450f0eb3 100644 --- a/src/ros2_medkit_gateway/test/test_fault_handlers.cpp +++ b/src/ros2_medkit_gateway/test/test_fault_handlers.cpp @@ -387,3 +387,92 @@ TEST_F(FaultHandlersTest, BuildSovdFaultResponseMixedSnapshots) { EXPECT_EQ(snap1["type"], "rosbag"); EXPECT_EQ(snap1["bulk_data_uri"], "/components/motor/bulk-data/rosbags/MIXED_FAULT"); } + +// ============================================================================= +// fault_in_source_scope tests (#395) +// +// Direct coverage for the scope-check logic that backs per-entity GET/DELETE +// and the collection-route filter. Integration tests pin the HTTP behavior; +// these tests pin the boundary semantics that decide what counts as "in +// scope" - especially the cases that motivated the post-review tightening: +// prefix-colliding FQNs and multi-source faults. +// ============================================================================= + +namespace { + +json make_fault(std::vector reporting_sources, const std::string & code = "F1") { + json f; + f["fault_code"] = code; + f["reporting_sources"] = std::move(reporting_sources); + return f; +} + +} // namespace + +TEST(FaultInSourceScopeTest, SingleSourceExactMatchInScope) { + EXPECT_TRUE(FaultHandlers::fault_in_source_scope(make_fault({"/perception/lidar/lidar_sensor"}), + {"/perception/lidar/lidar_sensor"})); +} + +TEST(FaultInSourceScopeTest, SubpathOfScopedFqnIsInScope) { + // A node nested under the entity's own FQN must still match - e.g. an app + // FQN `/perception/lidar/lidar_sensor` and a reporter at + // `/perception/lidar/lidar_sensor/diagnostic_updater` should be the same + // app's sub-node, not a stranger. + EXPECT_TRUE(FaultHandlers::fault_in_source_scope(make_fault({"/perception/lidar/lidar_sensor/diagnostic_updater"}), + {"/perception/lidar/lidar_sensor"})); +} + +TEST(FaultInSourceScopeTest, PrefixCollidingNameIsNotInScope) { + // `/ns/node_extra` shares the `/ns/node` prefix but is a distinct ROS node. + // The pre-review check used `rfind(prefix, 0) == 0` and silently let it + // through - this test pins the path-boundary fix. + EXPECT_FALSE(FaultHandlers::fault_in_source_scope(make_fault({"/ns/node_extra"}), {"/ns/node"})); +} + +TEST(FaultInSourceScopeTest, MultiSourceAllInScopeIsInScope) { + std::set scope{"/perception/lidar/lidar_sensor", "/perception/rgb/rgb_camera"}; + EXPECT_TRUE(FaultHandlers::fault_in_source_scope( + make_fault({"/perception/lidar/lidar_sensor", "/perception/rgb/rgb_camera"}), scope)); +} + +TEST(FaultInSourceScopeTest, MultiSourcePartiallyInScopeIsOutOfScope) { + // The all-sources semantic blocks a cross-entity DELETE escalation: an + // entity that owns only `/perception/lidar/lidar_sensor` must not be able + // to clear (or read in detail) a fault that another entity in + // `/telemetry/...` also reported. + std::set scope{"/perception/lidar/lidar_sensor"}; + EXPECT_FALSE(FaultHandlers::fault_in_source_scope( + make_fault({"/perception/lidar/lidar_sensor", "/telemetry/telemetry_node"}), scope)); +} + +TEST(FaultInSourceScopeTest, EmptyScopeSetIsOutOfScope) { + EXPECT_FALSE(FaultHandlers::fault_in_source_scope(make_fault({"/perception/lidar/lidar_sensor"}), {})); +} + +TEST(FaultInSourceScopeTest, EmptyReportingSourcesIsOutOfScope) { + EXPECT_FALSE(FaultHandlers::fault_in_source_scope(make_fault({}), {"/perception/lidar/lidar_sensor"})); +} + +TEST(FaultInSourceScopeTest, MissingReportingSourcesFieldIsOutOfScope) { + json fault; + fault["fault_code"] = "F1"; + EXPECT_FALSE(FaultHandlers::fault_in_source_scope(fault, {"/perception/lidar/lidar_sensor"})); +} + +TEST(FaultInSourceScopeTest, NonStringSourceEntryIsOutOfScope) { + json fault; + fault["fault_code"] = "F1"; + fault["reporting_sources"] = json::array(); + fault["reporting_sources"].push_back("/perception/lidar/lidar_sensor"); + fault["reporting_sources"].push_back(42); // Malformed entry + EXPECT_FALSE(FaultHandlers::fault_in_source_scope(fault, {"/perception/lidar/lidar_sensor"})); +} + +TEST(FaultInSourceScopeTest, RootFqnEdgeCase) { + // Boundary check still works with "/" prefix candidates (artificial but + // exercises the index arithmetic): "/" itself can't match "/anything" + // unless the scope set actually contains "/" - which our resolver never + // emits. This pin catches a future regression that special-cased "/". + EXPECT_FALSE(FaultHandlers::fault_in_source_scope(make_fault({"/something"}), {"/other"})); +} diff --git a/src/ros2_medkit_gateway/test/test_handler_context.cpp b/src/ros2_medkit_gateway/test/test_handler_context.cpp index 1c64037c..f06f7792 100644 --- a/src/ros2_medkit_gateway/test/test_handler_context.cpp +++ b/src/ros2_medkit_gateway/test/test_handler_context.cpp @@ -1358,6 +1358,68 @@ TEST(ResolveEntitySourceFqnsTest, FunctionAggregatesHostedAppFqns) { EXPECT_EQ(fqns, expected); } +TEST(ResolveEntitySourceFqnsTest, AreaWalksNestedSubareasRecursively) { + // Real manifests put components under subareas (e.g. demo manifest attaches + // components to `engine` which is a subarea of `powertrain`). A query for + // the top-level area must walk subareas, otherwise the demo would 404 on + // every `/areas/powertrain/...` fault read. + ThreadSafeEntityCache cache; + Area powertrain; + powertrain.id = "powertrain"; + powertrain.namespace_path = "/powertrain"; + Area engine; + engine.id = "engine"; + engine.namespace_path = "/powertrain/engine"; + engine.parent_area_id = "powertrain"; + Area perception; + perception.id = "perception"; + perception.namespace_path = "/perception"; + cache.update_areas({powertrain, engine, perception}); + + Component engine_ecu; + engine_ecu.id = "engine-ecu"; + engine_ecu.area = "engine"; // attached to subarea, NOT to top-level + Component lidar_unit; + lidar_unit.id = "lidar-unit"; + lidar_unit.area = "perception"; + cache.update_components({engine_ecu, lidar_unit}); + + cache.update_apps({ + make_owned_app("engine-temp-sensor", "engine-ecu", "temp_sensor", "/powertrain/engine"), + make_owned_app("lidar-sensor", "lidar-unit", "lidar_sensor", "/perception/lidar"), + }); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::AREA, "powertrain")); + EXPECT_EQ(fqns, std::set{"/powertrain/engine/temp_sensor"}); +} + +TEST(ResolveEntitySourceFqnsTest, FunctionHostingComponentExpandsToComponentApps) { + // Function.hosts can carry component IDs as well as app IDs (see + // function.hpp). The cache's function_to_apps_ index only resolves + // app-host entries, so the helper must reach into Function.hosts itself + // and expand component hosts to the apps they own. + ThreadSafeEntityCache cache; + cache.update_components({Component{}}); // touch to ensure indexes settle + Component drive_ecu; + drive_ecu.id = "drive-ecu"; + cache.update_components({drive_ecu}); + + cache.update_apps({ + make_owned_app("planner", "drive-ecu", "planner_node", "/drive"), + make_owned_app("localizer", "drive-ecu", "localizer_node", "/drive"), + make_owned_app("standalone", "", "standalone_node", "/misc"), + }); + + Function autonomy; + autonomy.id = "autonomy"; + autonomy.hosts = {"drive-ecu", "standalone"}; // mix of component + app host + cache.update_functions({autonomy}); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::FUNCTION, "autonomy")); + std::set expected{"/drive/planner_node", "/drive/localizer_node", "/misc/standalone_node"}; + EXPECT_EQ(fqns, expected); +} + TEST(ResolveEntitySourceFqnsTest, FunctionWithUnboundHostsReturnsOnlyResolvedFqns) { // Function.hosts referencing an app that does not produce an effective_fqn // (no bound_fqn and no ros_binding) must be skipped, not return empty FQNs diff --git a/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py index 170dc2a3..786c6c47 100644 --- a/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py +++ b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py @@ -71,6 +71,20 @@ class TestFaultsScopeIsolation(GatewayTestCase): def setUp(self): super().setUp() + # Verify both components exist before running scope-leak assertions. + # Without this, a 404 from an undiscovered entity would be + # indistinguishable from a 404 produced by the scope filter, and the + # regression test would silently pass even if the fix was reverted. + for comp_id in (LIDAR_OWNER_COMPONENT, OTHER_COMPONENT): + response = requests.get( + f'{self.BASE_URL}/components/{comp_id}', timeout=10, + ) + self.assertEqual( + response.status_code, 200, + f'Precondition failed: component {comp_id} was not discovered ' + f'(GET returned {response.status_code}); the 404 assertions ' + f'below would pass for the wrong reason.', + ) # The lidar app reports LIDAR_RANGE_INVALID deterministically when # launched with faulty params. Wait until the gateway sees it on the # owning app before exercising the per-component routes. From 95555f7b40c2a9335a1cfa9c44f6e36d57b8209c Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Thu, 14 May 2026 15:45:24 +0200 Subject: [PATCH 4/6] fix(gateway,#395): address N1-N5 from second self-review pass N1: Per-entity App fault list response no longer emits the global muted_count / cluster_count / muted_faults / clusters correlation metadata. include_muted / include_clusters URL params are inert on per-entity routes - they remain on the global GET /api/v1/faults route where the aggregates are meaningful. N2: ClearFault.srv gains a `skip_correlation_auto_clear` request flag. fault_manager_node honors it: when true, the correlation engine is NOT asked to compute the auto-clear cascade for the cleared fault code, and auto_cleared_codes in the response is empty. Per-entity DELETE routes (both single-fault and clear-all) set the flag to true so a scoped clear cannot reach faults reported by apps in other entities via auto_clear_with_root rules. Global DELETE /api/v1/faults keeps the flag false to preserve cluster-wide clearing. N3: handle_list_faults and handle_clear_all_faults Function/Area branches now route through HandlerContext::resolve_entity_source_fqns. Previously the Function branch used cache.get_entity_configurations() which only resolves App hosts (so a Function whose `hosts` list carried a Component ID returned an empty FQN set), and the Area branch walked cache.get_components_for_area() without recursing into subareas (so a top-level area like `powertrain` whose components lived in the `engine` subarea returned an empty set). Both classes are fixed via the shared helper. N4: 404 messages on per-fault GET/DELETE now describe the actual all-sources rule: a fault must have every reporting_source in the entity's scope; mixed-source faults that include any out-of-entity reporter are rejected. The previous wording said "not reported by any app within scope" which was wrong for the mixed-source case. N5: CHANGELOG behavior-change description now matches the implementation - the rule is "every reporting_source must be in scope", not just "reporting_sources must intersect". Documents the include_muted / include_clusters drop and the new ClearFault flag for per-entity DELETE. Test changes: - MockFaultServiceTransport in test_fault_manager_routing tracks the new skip_correlation_auto_clear arg. - Two new FaultManagerRoutingTest cases pin the default (false, for global routes) and the per-entity routing (true, for scoped DELETE). --- .../src/fault_manager_node.cpp | 7 +- src/ros2_medkit_gateway/CHANGELOG.rst | 2 +- .../core/managers/fault_manager.hpp | 6 +- .../transports/fault_service_transport.hpp | 7 +- .../ros2_fault_service_transport.hpp | 2 +- .../src/core/managers/fault_manager.cpp | 4 +- .../src/http/handlers/fault_handlers.cpp | 269 +++++------------- .../ros2_fault_service_transport.cpp | 3 +- .../test/test_fault_manager_routing.cpp | 31 +- src/ros2_medkit_msgs/srv/ClearFault.srv | 8 + 10 files changed, 130 insertions(+), 209 deletions(-) diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index e8880992..a9c8d742 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -530,9 +530,12 @@ void FaultManagerNode::handle_clear_fault( return; } - // Process through correlation engine first (to get auto-clear list) + // Process through correlation engine first (to get auto-clear list). + // `skip_correlation_auto_clear` lets the caller opt out of cascade-clearing + // correlated symptom fault codes. Per-entity DELETE routes set it to true + // so they cannot reach across entity boundaries via the correlation graph. std::vector auto_cleared_codes; - if (correlation_engine_) { + if (correlation_engine_ && !request->skip_correlation_auto_clear) { auto clear_result = correlation_engine_->process_clear(request->fault_code); auto_cleared_codes = clear_result.auto_cleared_codes; } diff --git a/src/ros2_medkit_gateway/CHANGELOG.rst b/src/ros2_medkit_gateway/CHANGELOG.rst index 4657166c..1c811292 100644 --- a/src/ros2_medkit_gateway/CHANGELOG.rst +++ b/src/ros2_medkit_gateway/CHANGELOG.rst @@ -7,7 +7,7 @@ Unreleased **Breaking Changes:** -* Per-entity fault routes are now correctly scoped to the entity's hosted apps. ``GET /api/v1/{entity-path}/faults/{fault_code}``, ``DELETE /api/v1/{entity-path}/faults/{fault_code}``, ``GET /api/v1/{entity-path}/faults``, and ``DELETE /api/v1/{entity-path}/faults`` previously fell back to a prefix match against the entity's ``namespace_path``; when that was empty (host-derived / synthetic components, manifest components without a ``namespace`` field, Areas, Functions, and Apps with a wildcard ``ros_binding.namespace_pattern``) the scope filter was silently disabled and the routes exposed - and on ``DELETE``, cleared - faults reported by apps that belonged to entirely different entities. All four handlers now resolve the addressed entity to its hosted-app FQN set (via the new ``HandlerContext::resolve_entity_source_fqns`` helper) and apply that set as a positive scope filter. Per-fault routes return ``404 Resource Not Found`` when the fault's ``reporting_sources`` does not intersect the set; collection routes return an empty ``items`` array. The underlying ``GetFault.srv`` / ``ClearFault.srv`` contracts are unchanged. Any client that previously received a ``200`` / ``204`` (single fault) or a populated ``items`` array for faults outside an entity's scope will now get ``404`` or an empty list (`#395 `_) +* Per-entity fault routes are now correctly scoped to the entity's hosted apps. ``GET /api/v1/{entity-path}/faults/{fault_code}``, ``DELETE /api/v1/{entity-path}/faults/{fault_code}``, ``GET /api/v1/{entity-path}/faults``, and ``DELETE /api/v1/{entity-path}/faults`` previously fell back to a prefix match against the entity's ``namespace_path``; when that was empty (host-derived / synthetic components, manifest components without a ``namespace`` field, Areas, Functions, and Apps with a wildcard ``ros_binding.namespace_pattern``) the scope filter was silently disabled and the routes exposed - and on ``DELETE``, cleared - faults reported by apps that belonged to entirely different entities. All four handlers now resolve the addressed entity to its hosted-app FQN set (via the new ``HandlerContext::resolve_entity_source_fqns`` helper) and apply a strict all-sources scope check: a fault counts as in scope only when **every** entry in its ``reporting_sources`` is owned by the entity (exact FQN match, or strict path-child via ``/...``). Per-fault routes return ``404 Resource Not Found`` for any fault that fails the check; collection routes return an empty ``items`` array. The underlying ``GetFault.srv`` contract is unchanged; ``ClearFault.srv`` gains a new ``skip_correlation_auto_clear`` request flag so per-entity DELETE can opt out of cascade-clearing correlated symptom fault codes that may live in other entities. Per-entity collection responses no longer include the global ``muted_count`` / ``cluster_count`` / ``muted_faults`` / ``clusters`` correlation metadata; those remain on the global ``GET /api/v1/faults`` route. Behavior changes visible to clients: (a) faults reported by apps outside the addressed entity are no longer returned or cleared via that entity's route, (b) **mixed-source** faults that include at least one out-of-entity reporter are likewise rejected with ``404`` on per-fault routes and excluded from per-entity collection responses (use the global ``GET /api/v1/faults`` to see them), (c) per-entity DELETE no longer cascade-clears correlated symptoms outside the entity (`#395 `_) * ``GET /api/v1/updates/{id}/status`` no longer returns ``404`` for a registered-but-idle package; ``POST /api/v1/updates`` now seeds a ``pending`` status, so the endpoint returns ``200 {"status": "pending"}`` immediately after registration. ``404`` is reserved for packages that are not registered. Clients that used ``404`` as a signal for "registered but nothing started yet" must adapt (`#378 `_) **Features:** diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/managers/fault_manager.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/managers/fault_manager.hpp index 85798df2..2b6627b7 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/managers/fault_manager.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/managers/fault_manager.hpp @@ -79,8 +79,10 @@ class FaultManager { /// Get a specific fault by code (JSON result - "fault" body only). FaultResult get_fault(const std::string & fault_code, const std::string & source_id = ""); - /// Clear a fault. - FaultResult clear_fault(const std::string & fault_code); + /// Clear a fault. When `skip_correlation_auto_clear` is true the fault + /// manager will not cascade-clear correlated symptom fault codes - per-entity + /// DELETE routes set this to keep their clear inside the entity boundary. + FaultResult clear_fault(const std::string & fault_code, bool skip_correlation_auto_clear = false); /// Get snapshots for a fault (optional topic filter). FaultResult get_snapshots(const std::string & fault_code, const std::string & topic = ""); diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/transports/fault_service_transport.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/transports/fault_service_transport.hpp index 63963d08..8a4c1288 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/transports/fault_service_transport.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/transports/fault_service_transport.hpp @@ -47,7 +47,12 @@ class FaultServiceTransport { virtual FaultResult get_fault(const std::string & fault_code, const std::string & source_id) = 0; - virtual FaultResult clear_fault(const std::string & fault_code) = 0; + /// Clear a fault by its fault_code. + /// `skip_correlation_auto_clear`, when true, asks the fault manager to NOT + /// cascade-clear correlated symptom fault codes. Per-entity DELETE routes + /// set this to true so the clear cannot reach faults reported by apps + /// outside the addressed entity via the correlation graph. + virtual FaultResult clear_fault(const std::string & fault_code, bool skip_correlation_auto_clear = false) = 0; virtual FaultResult get_snapshots(const std::string & fault_code, const std::string & topic) = 0; diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/ros2/transports/ros2_fault_service_transport.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/ros2/transports/ros2_fault_service_transport.hpp index 7d8ddf96..d6766b22 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/ros2/transports/ros2_fault_service_transport.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/ros2/transports/ros2_fault_service_transport.hpp @@ -70,7 +70,7 @@ class Ros2FaultServiceTransport : public FaultServiceTransport { FaultResult get_fault(const std::string & fault_code, const std::string & source_id) override; - FaultResult clear_fault(const std::string & fault_code) override; + FaultResult clear_fault(const std::string & fault_code, bool skip_correlation_auto_clear) override; FaultResult get_snapshots(const std::string & fault_code, const std::string & topic) override; diff --git a/src/ros2_medkit_gateway/src/core/managers/fault_manager.cpp b/src/ros2_medkit_gateway/src/core/managers/fault_manager.cpp index 68f22b20..cb02ef22 100644 --- a/src/ros2_medkit_gateway/src/core/managers/fault_manager.cpp +++ b/src/ros2_medkit_gateway/src/core/managers/fault_manager.cpp @@ -41,8 +41,8 @@ FaultResult FaultManager::get_fault(const std::string & fault_code, const std::s return transport_->get_fault(fault_code, source_id); } -FaultResult FaultManager::clear_fault(const std::string & fault_code) { - return transport_->clear_fault(fault_code); +FaultResult FaultManager::clear_fault(const std::string & fault_code, bool skip_correlation_auto_clear) { + return transport_->clear_fault(fault_code, skip_correlation_auto_clear); } FaultResult FaultManager::get_snapshots(const std::string & fault_code, const std::string & topic) { diff --git a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp index 6d1273b1..e16740e6 100644 --- a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp +++ b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp @@ -431,9 +431,12 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re return; } - // Parse correlation query parameters - bool include_muted = req.get_param_value("include_muted") == "true"; - bool include_clusters = req.get_param_value("include_clusters") == "true"; + // Note: include_muted / include_clusters URL params are intentionally + // ignored on per-entity routes - the underlying service returns + // correlation metadata computed across the entire fault manager, so + // emitting it on a scoped response would leak cross-entity data + // (review finding N1). The global `GET /faults` route is the only place + // these flags are honored. auto fault_mgr = ctx_.node()->get_fault_manager(); @@ -441,10 +444,10 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re // Functions don't have a single namespace_path (it is always empty in EntityInfo) // because they host apps from potentially different namespaces. // Instead, we collect the FQNs of all host apps and filter by reporting_source. - if (entity_info.type == EntityType::FUNCTION) { + if (entity_info.type == EntityType::FUNCTION || entity_info.type == EntityType::COMPONENT) { // Get all faults (no namespace filter) auto result = fault_mgr->list_faults("", filter.include_pending, filter.include_confirmed, filter.include_cleared, - filter.include_healed, include_muted, include_clusters); + filter.include_healed, /*include_muted=*/false, /*include_clusters=*/false); if (!result.success) { HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to get faults", @@ -452,30 +455,26 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re return; } - // Collect host app FQNs for filtering + // Resolve FQN set via the shared helper so subareas (Areas) and + // component-hosted Functions are handled the same way as the per-fault + // routes - the previous Function branch only walked + // `get_entity_configurations` and dropped component hosts; the previous + // Component branch was correct but is now consolidated for parity. const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto agg_configs = cache.get_entity_configurations(entity_id); - std::set host_fqns; - for (const auto & node : agg_configs.nodes) { - if (!node.node_fqn.empty()) { - host_fqns.insert(node.node_fqn); - } - } + auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); - // Filter faults to only those from function's host apps - json filtered_faults = filter_faults_by_sources(result.data["faults"], host_fqns); + json filtered_faults = filter_faults_by_sources(result.data["faults"], source_fqns); - // Build response json response = {{"items", filtered_faults}}; XMedkit ext; ext.entity_id(entity_id); - ext.add("aggregation_level", "function"); + const bool is_function = entity_info.type == EntityType::FUNCTION; + ext.add("aggregation_level", is_function ? "function" : "component"); ext.add("aggregated", true); - ext.add("host_count", host_fqns.size()); - // Include source app IDs for cross-referencing aggregated results + ext.add(is_function ? "host_count" : "app_count", source_fqns.size()); json source_ids = json::array(); - for (const auto & fqn : host_fqns) { + for (const auto & fqn : source_fqns) { source_ids.push_back(fqn); } ext.add("aggregation_sources", source_ids); @@ -487,64 +486,11 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re return; } - // For Components, aggregate faults from all hosted apps - // Components group Apps, so we filter by the apps' FQNs rather than namespace (which is too broad) - if (entity_info.type == EntityType::COMPONENT) { - // Get all faults (no namespace filter) - auto result = fault_mgr->list_faults("", filter.include_pending, filter.include_confirmed, filter.include_cleared, - filter.include_healed, include_muted, include_clusters); - - if (!result.success) { - HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to get faults", - {{"details", result.error_message}, {entity_info.id_field, entity_id}}); - return; - } - - // Collect hosted app FQNs for filtering - const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto app_ids = cache.get_apps_for_component(entity_id); - std::set app_fqns; - for (const auto & app_id : app_ids) { - auto app = cache.get_app(app_id); - if (app) { - auto fqn = app->effective_fqn(); - if (!fqn.empty()) { - app_fqns.insert(std::move(fqn)); - } - } - } - - // Filter faults to only those from component's hosted apps - json filtered_faults = filter_faults_by_sources(result.data["faults"], app_fqns); - - // Build response - json response = {{"items", filtered_faults}}; - - XMedkit ext; - ext.entity_id(entity_id); - ext.add("aggregation_level", "component"); - ext.add("aggregated", true); - ext.add("app_count", app_fqns.size()); - // Include source app FQNs for cross-referencing aggregated results - json source_fqns = json::array(); - for (const auto & fqn : app_fqns) { - source_fqns.push_back(fqn); - } - ext.add("aggregation_sources", source_fqns); - - merge_peer_items(ctx_.aggregation_manager(), req, response, ext); - ext.add("count", response["items"].size()); - response["x-medkit"] = ext.build(); - HandlerContext::send_json(res, response); - return; - } - // For Areas, aggregate faults from all apps in all components within the area // This is an x-medkit extension - SOVD spec does not define fault collections for Areas if (entity_info.type == EntityType::AREA) { - // Get all faults (no namespace filter) auto result = fault_mgr->list_faults("", filter.include_pending, filter.include_confirmed, filter.include_cleared, - filter.include_healed, include_muted, include_clusters); + filter.include_healed, /*include_muted=*/false, /*include_clusters=*/false); if (!result.success) { HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to get faults", @@ -552,36 +498,23 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re return; } - // Collect FQNs from all apps in all components belonging to this area + // Resolve via the shared helper so the BFS over `get_subareas()` reaches + // components attached to nested subareas (the demo manifest puts every + // component on a subarea, so a direct `get_components_for_area` walk + // would resolve to the empty set and silently 404 every area route). const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto comp_ids = cache.get_components_for_area(entity_id); - std::set app_fqns; - for (const auto & comp_id : comp_ids) { - auto app_ids = cache.get_apps_for_component(comp_id); - for (const auto & app_id : app_ids) { - auto app = cache.get_app(app_id); - if (app) { - auto fqn = app->effective_fqn(); - if (!fqn.empty()) { - app_fqns.insert(std::move(fqn)); - } - } - } - } + auto app_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); - // Filter faults to only those from the area's apps json filtered_faults = filter_faults_by_sources(result.data["faults"], app_fqns); - // Build response json response = {{"items", filtered_faults}}; XMedkit ext; ext.entity_id(entity_id); ext.add("aggregation_level", "area"); ext.add("aggregated", true); - ext.add("component_count", comp_ids.size()); + ext.add("component_count", cache.get_components_for_area(entity_id).size()); ext.add("app_count", app_fqns.size()); - // Include source app FQNs for cross-referencing aggregated results json area_source_fqns = json::array(); for (const auto & fqn : app_fqns) { area_source_fqns.push_back(fqn); @@ -603,8 +536,15 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re // Components - fix it the same way. const auto & cache = ctx_.node()->get_thread_safe_cache(); auto app_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); + // include_muted / include_clusters intentionally NOT forwarded: the + // service returns muted/cluster aggregates computed across the entire + // fault manager, not against the entity's app FQN set, so emitting them + // on a per-entity response would leak cross-entity correlation metadata + // (review finding N1). Clients that need system-wide correlation data + // use the global `GET /faults` route. auto result = fault_mgr->list_faults("", filter.include_pending, filter.include_confirmed, filter.include_cleared, - filter.include_healed, include_muted, include_clusters); + filter.include_healed, + /*include_muted=*/false, /*include_clusters=*/false); if (result.success) { json filtered_faults = filter_faults_by_sources(result.data["faults"], app_fqns); @@ -617,18 +557,8 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re ext.entity_id(entity_id); ext.add("source_id", entity_info.namespace_path); - // Include detailed correlation data if requested and present - if (result.data.contains("muted_faults")) { - ext.add("muted_faults", result.data["muted_faults"]); - } - if (result.data.contains("clusters")) { - ext.add("clusters", result.data["clusters"]); - } - merge_peer_items(ctx_.aggregation_manager(), req, response, ext); ext.add("count", response["items"].size()); - ext.add("muted_count", result.data["muted_count"]); - ext.add("cluster_count", result.data["cluster_count"]); response["x-medkit"] = ext.build(); HandlerContext::send_json(res, response); } else { @@ -722,10 +652,14 @@ void FaultHandlers::handle_get_fault(const httplib::Request & req, httplib::Resp const auto & cache = ctx_.node()->get_thread_safe_cache(); auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); if (!FaultHandlers::fault_in_source_scope(fault_json, source_fqns)) { - HandlerContext::send_error(res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", - {{"details", "Fault is not reported by any app within this entity's scope"}, - {entity_info.id_field, entity_id}, - {"fault_code", fault_code}}); + HandlerContext::send_error( + res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", + {{"details", + "Fault is not in scope for this entity: every reporting source must be one of the entity's " + "owned apps, and a mixed-source fault that includes any out-of-entity reporter is rejected " + "to prevent cross-entity disclosure"}, + {entity_info.id_field, entity_id}, + {"fault_code", fault_code}}); return; } @@ -839,14 +773,22 @@ void FaultHandlers::handle_clear_fault(const httplib::Request & req, httplib::Re auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); const auto & fault_json = get_result.data.value("fault", json::object()); if (!FaultHandlers::fault_in_source_scope(fault_json, source_fqns)) { - HandlerContext::send_error(res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", - {{"details", "Fault is not reported by any app within this entity's scope"}, - {entity_info.id_field, entity_id}, - {"fault_code", fault_code}}); + HandlerContext::send_error( + res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", + {{"details", + "Fault is not in scope for this entity: every reporting source must be one of the entity's " + "owned apps, and a mixed-source fault that includes any out-of-entity reporter is rejected " + "to prevent cross-entity disclosure"}, + {entity_info.id_field, entity_id}, + {"fault_code", fault_code}}); return; } - auto result = fault_mgr->clear_fault(fault_code); + // Pass `skip_correlation_auto_clear=true` so this scoped DELETE cannot + // cascade-clear correlated symptom fault codes that may be reported by + // apps in other entities. The cluster-wide clear behaviour is preserved + // on the global `DELETE /api/v1/faults` route below. + auto result = fault_mgr->clear_fault(fault_code, /*skip_correlation_auto_clear=*/true); if (result.success) { // Format: return 204 No Content on successful delete @@ -944,97 +886,28 @@ void FaultHandlers::handle_clear_all_faults(const httplib::Request & req, httpli auto fault_mgr = ctx_.node()->get_fault_manager(); - // For non-App entities (Functions, Components, Areas), namespace_path is - // either empty or too broad for accurate filtering. Use the same FQN-based - // approach as handle_list_faults: collect host app FQNs and filter by - // reporting_source match. - json faults_to_clear; - - if (entity_info.type == EntityType::FUNCTION) { - auto result = fault_mgr->list_faults(""); - if (!result.success) { - HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to retrieve faults", - {{"details", result.error_message}, {entity_info.id_field, entity_id}}); - return; - } - const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto agg_configs = cache.get_entity_configurations(entity_id); - std::set host_fqns; - for (const auto & node : agg_configs.nodes) { - if (!node.node_fqn.empty()) { - host_fqns.insert(node.node_fqn); - } - } - faults_to_clear = filter_faults_by_sources(result.data["faults"], host_fqns); - - } else if (entity_info.type == EntityType::COMPONENT) { - auto result = fault_mgr->list_faults(""); - if (!result.success) { - HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to retrieve faults", - {{"details", result.error_message}, {entity_info.id_field, entity_id}}); - return; - } - const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto app_ids = cache.get_apps_for_component(entity_id); - std::set app_fqns; - for (const auto & app_id : app_ids) { - auto app = cache.get_app(app_id); - if (app) { - auto fqn = app->effective_fqn(); - if (!fqn.empty()) { - app_fqns.insert(std::move(fqn)); - } - } - } - faults_to_clear = filter_faults_by_sources(result.data["faults"], app_fqns); - - } else if (entity_info.type == EntityType::AREA) { - auto result = fault_mgr->list_faults(""); - if (!result.success) { - HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to retrieve faults", - {{"details", result.error_message}, {entity_info.id_field, entity_id}}); - return; - } - const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto comp_ids = cache.get_components_for_area(entity_id); - std::set app_fqns; - for (const auto & comp_id : comp_ids) { - auto app_ids_inner = cache.get_apps_for_component(comp_id); - for (const auto & app_id : app_ids_inner) { - auto app = cache.get_app(app_id); - if (app) { - auto fqn = app->effective_fqn(); - if (!fqn.empty()) { - app_fqns.insert(std::move(fqn)); - } - } - } - } - faults_to_clear = filter_faults_by_sources(result.data["faults"], app_fqns); - - } else { - // Apps: scope by the app's effective FQN. Cannot use the transport-level - // prefix filter (`list_faults(namespace_path)`) directly because Apps - // with a wildcard `ros_binding.namespace_pattern` produce an empty - // effective_fqn, the transport silently disables filtering, and we - // would clear every fault in the system. Same bug class as #395. - auto result = fault_mgr->list_faults(""); - if (!result.success) { - HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to retrieve faults", - {{"details", result.error_message}, {entity_info.id_field, entity_id}}); - return; - } - const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto app_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); - faults_to_clear = filter_faults_by_sources(result.data["faults"], app_fqns); + // Same scope rules as `handle_list_faults` and `handle_get_fault`: every + // entity type resolves through `HandlerContext::resolve_entity_source_fqns` + // so the area BFS, function-hosting-component expansion, and wildcard-app + // empty-set behavior stay consistent across all four fault routes. + auto result = fault_mgr->list_faults(""); + if (!result.success) { + HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to retrieve faults", + {{"details", result.error_message}, {entity_info.id_field, entity_id}}); + return; } + const auto & cache = ctx_.node()->get_thread_safe_cache(); + auto entity_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); + json faults_to_clear = filter_faults_by_sources(result.data["faults"], entity_fqns); - // Clear each matching fault + // Clear each matching fault. Use `skip_correlation_auto_clear=true` for + // the same reason as the single-fault DELETE: keep this entity's clear + // from cascading into correlated symptoms reported by other entities. if (faults_to_clear.is_array()) { for (const auto & fault : faults_to_clear) { if (fault.contains("fault_code")) { std::string fault_code = fault["fault_code"].get(); - auto clear_result = fault_mgr->clear_fault(fault_code); + auto clear_result = fault_mgr->clear_fault(fault_code, /*skip_correlation_auto_clear=*/true); if (!clear_result.success) { RCLCPP_WARN(HandlerContext::logger(), "Failed to clear fault '%s' for entity '%s': %s", fault_code.c_str(), entity_id.c_str(), clear_result.error_message.c_str()); diff --git a/src/ros2_medkit_gateway/src/ros2/transports/ros2_fault_service_transport.cpp b/src/ros2_medkit_gateway/src/ros2/transports/ros2_fault_service_transport.cpp index 1e531d63..a48dfcf2 100644 --- a/src/ros2_medkit_gateway/src/ros2/transports/ros2_fault_service_transport.cpp +++ b/src/ros2_medkit_gateway/src/ros2/transports/ros2_fault_service_transport.cpp @@ -281,7 +281,7 @@ FaultResult Ros2FaultServiceTransport::get_fault(const std::string & fault_code, return result; } -FaultResult Ros2FaultServiceTransport::clear_fault(const std::string & fault_code) { +FaultResult Ros2FaultServiceTransport::clear_fault(const std::string & fault_code, bool skip_correlation_auto_clear) { std::lock_guard lock(clear_mutex_); FaultResult result; @@ -294,6 +294,7 @@ FaultResult Ros2FaultServiceTransport::clear_fault(const std::string & fault_cod auto request = std::make_shared(); request->fault_code = fault_code; + request->skip_correlation_auto_clear = skip_correlation_auto_clear; auto future = clear_fault_client_->async_send_request(request); diff --git a/src/ros2_medkit_gateway/test/test_fault_manager_routing.cpp b/src/ros2_medkit_gateway/test/test_fault_manager_routing.cpp index 59fd6c5d..a30ba927 100644 --- a/src/ros2_medkit_gateway/test/test_fault_manager_routing.cpp +++ b/src/ros2_medkit_gateway/test/test_fault_manager_routing.cpp @@ -88,8 +88,9 @@ class MockFaultServiceTransport : public FaultServiceTransport { return r; } - FaultResult clear_fault(const std::string & fault_code) override { + FaultResult clear_fault(const std::string & fault_code, bool skip_correlation_auto_clear) override { last_clear_code_ = fault_code; + last_clear_skip_correlation_ = skip_correlation_auto_clear; ++clear_calls_; FaultResult r; r.success = clear_success_; @@ -180,6 +181,7 @@ class MockFaultServiceTransport : public FaultServiceTransport { json clear_data_ = json{{"success", true}, {"message", "ok"}}; std::string clear_error_; std::string last_clear_code_; + bool last_clear_skip_correlation_ = false; int clear_calls_ = 0; bool snapshots_success_ = true; @@ -316,6 +318,33 @@ TEST(FaultManagerRoutingTest, ClearFaultDelegatesAndReturnsAutoClearedCodes) { EXPECT_EQ(r.data["auto_cleared_codes"].size(), 2u); } +TEST(FaultManagerRoutingTest, ClearFaultDefaultsToCorrelationAutoClear) { + // Global `DELETE /api/v1/faults` and any unscoped caller should preserve + // the legacy auto-clear-with-root behaviour (skip flag defaults to false). + auto mock = std::make_shared(); + mock->clear_success_ = true; + FaultManager mgr(mock); + + mgr.clear_fault("ROOT"); + + EXPECT_EQ(mock->last_clear_code_, "ROOT"); + EXPECT_FALSE(mock->last_clear_skip_correlation_); +} + +TEST(FaultManagerRoutingTest, ClearFaultForwardsSkipCorrelationFlag) { + // Per-entity DELETE routes call `clear_fault(code, /*skip=*/true)` so the + // fault manager does NOT cascade-clear correlated symptoms reported by + // apps outside the addressed entity. Pin the routing here. + auto mock = std::make_shared(); + mock->clear_success_ = true; + FaultManager mgr(mock); + + mgr.clear_fault("ROOT", /*skip_correlation_auto_clear=*/true); + + EXPECT_EQ(mock->last_clear_code_, "ROOT"); + EXPECT_TRUE(mock->last_clear_skip_correlation_); +} + TEST(FaultManagerRoutingTest, GetSnapshotsRoutesTopicFilter) { auto mock = std::make_shared(); mock->snapshots_data_ = {{"topics", json::object()}}; diff --git a/src/ros2_medkit_msgs/srv/ClearFault.srv b/src/ros2_medkit_msgs/srv/ClearFault.srv index c8f3f883..cfa0b778 100644 --- a/src/ros2_medkit_msgs/srv/ClearFault.srv +++ b/src/ros2_medkit_msgs/srv/ClearFault.srv @@ -21,6 +21,14 @@ # Request fields string fault_code # The fault_code to clear + +# Disable auto-cleanup of correlated symptom faults. When false (default), +# clearing a root-cause fault also clears every symptom the correlation engine +# attributes to it (auto_clear_with_root rules). When true, only the requested +# fault_code is cleared and `auto_cleared_codes` in the response is empty. +# Set this from scoped per-entity DELETE routes so a viewer of one entity +# cannot cascade-clear symptom faults reported by apps in other entities. +bool skip_correlation_auto_clear --- # Response fields bool success # True if the fault was found and cleared From cbbd8b1867b463cb3d9080d088d6d7739cf3550c Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Tue, 26 May 2026 17:44:31 +0200 Subject: [PATCH 5/6] fix(gateway,#395): document shared-fault rule, srv hash, body non-disclosure Three follow-ups on the per-entity fault scope fix: * `docs/api/rest.rst`: add a per-entity-scope note under Faults Endpoints spelling out the all-sources rule and steering operators to the global `/faults` route for fault codes shared across entities. The CHANGELOG documented the rule, but the API reference is where operators look. * `CHANGELOG.rst`: split the `ClearFault.srv` hash change into its own Breaking Changes bullet. Adding the `skip_correlation_auto_clear` request field changes the service type hash, so out-of-tree callers that invoke `/fault_manager/clear_fault` directly must rebuild against the new `ros2_medkit_msgs`. The in-tree gateway client and server were already updated together. * `test_faults_scope_isolation.test.py`: the 404 tests on both GET and DELETE now also assert the response body carries none of the success-shape keys (`item`, `environment_data`, `reporting_sources`, `snapshots`) and does not echo the out-of-scope reporting app FQN. Non-disclosure is the actual security property; a regression that flipped the status code to 404 but kept serializing the fault snapshot would otherwise slip past status-only assertions. --- docs/api/rest.rst | 17 +++++++++ src/ros2_medkit_gateway/CHANGELOG.rst | 1 + .../test_faults_scope_isolation.test.py | 37 +++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/docs/api/rest.rst b/docs/api/rest.rst index 044a77f3..993c5510 100644 --- a/docs/api/rest.rst +++ b/docs/api/rest.rst @@ -638,6 +638,23 @@ Query and manage faults. Faults are reported by ROS 2 nodes via the FaultReporter library, not via REST API. The gateway queries faults from the ros2_medkit_fault_manager node. +.. note:: + + **Per-entity fault scope (``/{entity-path}/faults`` routes).** The gateway keys + faults by ``fault_code`` only, and a fault's ``reporting_sources`` set is the + union of every app that has reported that code. Per-entity routes apply a + strict all-sources scope check: a fault is in scope for an entity iff **every** + entry in ``reporting_sources`` is an app owned by that entity (exact FQN + match, or strict path-child). + + This means a ``fault_code`` reported by apps in two different entities + (for example ``SENSOR_TIMEOUT`` reported by both the lidar and the + temperature sensor app) is **not** visible or clearable through either + entity's per-entity routes - per-fault routes return ``404``, collection + responses omit it, and per-entity ``DELETE`` skips it. To see, list, or + clear such shared faults use the global ``GET /api/v1/faults`` / + ``DELETE /api/v1/faults`` routes. + ``GET /api/v1/faults`` List all faults across the system. diff --git a/src/ros2_medkit_gateway/CHANGELOG.rst b/src/ros2_medkit_gateway/CHANGELOG.rst index 1c811292..7b8be3d1 100644 --- a/src/ros2_medkit_gateway/CHANGELOG.rst +++ b/src/ros2_medkit_gateway/CHANGELOG.rst @@ -7,6 +7,7 @@ Unreleased **Breaking Changes:** +* ``ros2_medkit_msgs/srv/ClearFault`` request gains a ``bool skip_correlation_auto_clear`` field (see the per-entity fault scope entry below for the in-tree motivation). Adding a request field changes the service type hash, so out-of-tree callers that invoke the service directly (for example ``ros2 service call /fault_manager/clear_fault ros2_medkit_msgs/srv/ClearFault ...`` as documented in the ``ros2_medkit_fault_manager`` README) must rebuild against the new ``ros2_medkit_msgs`` to keep talking to ``fault_manager``. The in-tree gateway client and server are updated together (`#395 `_) * Per-entity fault routes are now correctly scoped to the entity's hosted apps. ``GET /api/v1/{entity-path}/faults/{fault_code}``, ``DELETE /api/v1/{entity-path}/faults/{fault_code}``, ``GET /api/v1/{entity-path}/faults``, and ``DELETE /api/v1/{entity-path}/faults`` previously fell back to a prefix match against the entity's ``namespace_path``; when that was empty (host-derived / synthetic components, manifest components without a ``namespace`` field, Areas, Functions, and Apps with a wildcard ``ros_binding.namespace_pattern``) the scope filter was silently disabled and the routes exposed - and on ``DELETE``, cleared - faults reported by apps that belonged to entirely different entities. All four handlers now resolve the addressed entity to its hosted-app FQN set (via the new ``HandlerContext::resolve_entity_source_fqns`` helper) and apply a strict all-sources scope check: a fault counts as in scope only when **every** entry in its ``reporting_sources`` is owned by the entity (exact FQN match, or strict path-child via ``/...``). Per-fault routes return ``404 Resource Not Found`` for any fault that fails the check; collection routes return an empty ``items`` array. The underlying ``GetFault.srv`` contract is unchanged; ``ClearFault.srv`` gains a new ``skip_correlation_auto_clear`` request flag so per-entity DELETE can opt out of cascade-clearing correlated symptom fault codes that may live in other entities. Per-entity collection responses no longer include the global ``muted_count`` / ``cluster_count`` / ``muted_faults`` / ``clusters`` correlation metadata; those remain on the global ``GET /api/v1/faults`` route. Behavior changes visible to clients: (a) faults reported by apps outside the addressed entity are no longer returned or cleared via that entity's route, (b) **mixed-source** faults that include at least one out-of-entity reporter are likewise rejected with ``404`` on per-fault routes and excluded from per-entity collection responses (use the global ``GET /api/v1/faults`` to see them), (c) per-entity DELETE no longer cascade-clears correlated symptoms outside the entity (`#395 `_) * ``GET /api/v1/updates/{id}/status`` no longer returns ``404`` for a registered-but-idle package; ``POST /api/v1/updates`` now seeds a ``pending`` status, so the endpoint returns ``200 {"status": "pending"}`` immediately after registration. ``404`` is reserved for packages that are not registered. Clients that used ``404`` as a signal for "registered but nothing started yet" must adapt (`#378 `_) diff --git a/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py index 786c6c47..3db7a8f4 100644 --- a/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py +++ b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py @@ -41,8 +41,20 @@ LIDAR_FAULT_CODE = 'LIDAR_RANGE_INVALID' LIDAR_OWNER_COMPONENT = 'lidar-unit' +LIDAR_REPORTING_APP_FQN = '/perception/lidar/lidar_sensor' OTHER_COMPONENT = 'temp-sensor-hw' +# JSON keys that only appear in the success-shape fault detail body +# (FaultHandlers::build_sovd_fault_response). If any of these surface in a +# 404 response body, the handler has switched the status code but is still +# serializing the out-of-scope fault, defeating the non-disclosure property. +LEAKED_BODY_KEYS = ( + 'item', + 'environment_data', + 'reporting_sources', + 'snapshots', +) + def generate_test_description(): pkg_share = get_package_share_directory('ros2_medkit_gateway') @@ -90,6 +102,29 @@ def setUp(self): # owning app before exercising the per-component routes. self.wait_for_fault('/apps/lidar-sensor', LIDAR_FAULT_CODE) + def assert_body_does_not_leak_fault(self, response): + """Assert a 404 response body carries no out-of-scope fault detail. + + The security property under test is non-disclosure: a regression + that returns 404 while still serializing the fault snapshot would + leak environment data, snapshots, and the owning app's FQN. Check + both the SOVD success-shape JSON keys and the raw owner-FQN string + so a future shape change cannot silently re-introduce the leak. + """ + body_text = response.text + body_json = response.json() + for key in LEAKED_BODY_KEYS: + self.assertNotIn( + key, body_json, + f'404 body must not carry success-shape key {key!r}: ' + f'body={body_text}', + ) + self.assertNotIn( + LIDAR_REPORTING_APP_FQN, body_text, + f'404 body must not disclose the out-of-scope reporting app FQN ' + f'{LIDAR_REPORTING_APP_FQN!r}: body={body_text}', + ) + def test_get_fault_returns_404_on_unrelated_component(self): """GET on a different component must not return another entity's fault. @@ -103,6 +138,7 @@ def test_get_fault_returns_404_on_unrelated_component(self): response.status_code, 404, f'Expected 404, got {response.status_code} body={response.text}', ) + self.assert_body_does_not_leak_fault(response) def test_get_fault_returns_200_on_owning_component(self): """GET on the component that hosts the reporting app must return the fault. @@ -127,6 +163,7 @@ def test_clear_fault_returns_404_on_unrelated_component(self): response.status_code, 404, f'Expected 404, got {response.status_code} body={response.text}', ) + self.assert_body_does_not_leak_fault(response) # Verify the fault is still present on the owning app afterwards. listing = self.get_json('/apps/lidar-sensor/faults') From 3dfd7eb34c81f20f83156233d5f51db41968bfc9 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Tue, 26 May 2026 18:30:33 +0200 Subject: [PATCH 6/6] docs(#395): document ClearFault.srv skip_correlation_auto_clear field Customer-facing docs across `messages.rst`, `ros2_medkit_msgs/README.md`, `ros2_medkit_fault_manager/README.md`, and the fault-correlation tutorial still listed the pre-fix `ClearFault.srv` shape (only `fault_code` in the request, `success` + `message` in the response). Update all four to: * List `skip_correlation_auto_clear` in the request, with the semantics spelled out (cascade clear by default; `true` opts out so the call clears only the named fault and leaves correlated symptoms alone) and the gateway routing rule (per-entity DELETE sets it to `true`, global DELETE leaves it `false`). * List `auto_cleared_codes` in the response (was already returned by the service, only the markdown table missed it). * Flag the service type-hash change for direct out-of-tree `ros2 service call` clients and generated clients - they must rebuild against the new `ros2_medkit_msgs` to keep talking to `fault_manager`. * Note in the fault-correlation tutorial that per-entity DELETE returns empty `auto_cleared_codes` and that operators who want the cascade must use the global `DELETE /api/v1/faults/{fault_code}` route. --- docs/api/messages.rst | 23 ++++++++++++++++++++++- docs/tutorials/fault-correlation.rst | 14 ++++++++++++++ src/ros2_medkit_fault_manager/README.md | 10 ++++++++-- src/ros2_medkit_msgs/README.md | 4 ++++ 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/docs/api/messages.rst b/docs/api/messages.rst index 5e23b98d..4ecd08e3 100644 --- a/docs/api/messages.rst +++ b/docs/api/messages.rst @@ -210,7 +210,8 @@ Clear/acknowledge a fault. .. code-block:: text - string fault_code # Fault code to clear + string fault_code # Fault code to clear + bool skip_correlation_auto_clear # Opt out of correlation cascade clear **Response:** @@ -220,6 +221,26 @@ Clear/acknowledge a fault. string message # Status message or error description string[] auto_cleared_codes # Symptoms auto-cleared with root cause +When ``skip_correlation_auto_clear`` is ``false`` (default), clearing a +root-cause fault also clears every symptom that the correlation engine +attributes to it via ``auto_clear_with_root`` rules; the cleared +symptom codes are returned in ``auto_cleared_codes``. When ``true``, +only the requested ``fault_code`` is cleared and ``auto_cleared_codes`` +is empty. The gateway's per-entity ``DELETE +/{entity-path}/faults/{fault_code}`` route sets this to ``true`` so +that an operator with access to one entity cannot cascade-clear +correlated symptoms reported by apps in other entities. The global +``DELETE /api/v1/faults/{fault_code}`` route leaves it ``false`` so +cluster-wide clearing still works. + +.. note:: + + Added in ``ros2_medkit_msgs`` post-0.4.0. Adding a request field + changes the service type hash, so out-of-tree callers that invoke + ``/fault_manager/clear_fault`` directly via ``ros2 service call`` or + a generated client must rebuild against the new ``ros2_medkit_msgs`` + release to keep talking to ``fault_manager``. + ListFaults.srv ~~~~~~~~~~~~~~ diff --git a/docs/tutorials/fault-correlation.rst b/docs/tutorials/fault-correlation.rst index a69c2d1c..07357780 100644 --- a/docs/tutorials/fault-correlation.rst +++ b/docs/tutorials/fault-correlation.rst @@ -383,6 +383,20 @@ Response always includes: "auto_cleared_codes": ["MOTOR_COMM_001", "MOTOR_TIMEOUT_002", "DRIVE_001"] } +.. note:: + + **Per-entity DELETE opts out of the cascade.** The same fault cleared + via ``DELETE /api/v1/{entity-path}/faults/ESTOP_001`` clears only + ``ESTOP_001`` itself - ``auto_cleared_codes`` will be empty in the + response. The gateway sets ``ClearFault.srv``'s + ``skip_correlation_auto_clear`` to ``true`` on per-entity routes so + that an operator with access to one entity cannot cascade-clear + correlated symptom faults reported by apps in other entities. Use + the global ``DELETE /api/v1/faults/{fault_code}`` route when you do + want the correlation cascade. Direct ``ros2 service call`` clients + can choose explicitly by setting ``skip_correlation_auto_clear`` in + the request body (see ``ros2_medkit_msgs`` ``ClearFault.srv``). + Example: Complete Configuration ------------------------------- diff --git a/src/ros2_medkit_fault_manager/README.md b/src/ros2_medkit_fault_manager/README.md index 6d12be32..bb8f2cd2 100644 --- a/src/ros2_medkit_fault_manager/README.md +++ b/src/ros2_medkit_fault_manager/README.md @@ -24,11 +24,17 @@ ros2 service call /fault_manager/report_fault ros2_medkit_msgs/srv/ReportFault \ ros2 service call /fault_manager/list_faults ros2_medkit_msgs/srv/ListFaults \ "{statuses: ['CONFIRMED']}" -# Clear a fault +# Clear a fault (cascade-clears correlated symptoms by default) ros2 service call /fault_manager/clear_fault ros2_medkit_msgs/srv/ClearFault \ - "{fault_code: 'MOTOR_OVERHEAT'}" + "{fault_code: 'MOTOR_OVERHEAT', skip_correlation_auto_clear: false}" + +# Clear without touching correlated symptoms +ros2 service call /fault_manager/clear_fault ros2_medkit_msgs/srv/ClearFault \ + "{fault_code: 'MOTOR_OVERHEAT', skip_correlation_auto_clear: true}" ``` +> **Note:** The `skip_correlation_auto_clear` request field was added post-0.4.0. Adding a request field changes the service type hash, so callers built against `ros2_medkit_msgs` 0.4.0 or earlier must rebuild to keep talking to `fault_manager`. + ## Services | Service | Type | Description | diff --git a/src/ros2_medkit_msgs/README.md b/src/ros2_medkit_msgs/README.md index c9e35d44..597747a3 100644 --- a/src/ros2_medkit_msgs/README.md +++ b/src/ros2_medkit_msgs/README.md @@ -125,12 +125,16 @@ Clear/acknowledge a fault. Cleared faults are retained and queryable with `statu | Field | Type | Description | |-------|------|-------------| | `fault_code` | string | The fault to clear | +| `skip_correlation_auto_clear` | bool | When `true`, only the requested fault_code is cleared; symptom faults that the correlation engine would normally auto-clear via `auto_clear_with_root` rules are left untouched. Default `false` (cascade clear). The gateway sets this to `true` on per-entity `DELETE /{entity-path}/faults/{fault_code}` so that an operator with access to one entity cannot cascade-clear correlated symptoms reported by apps in other entities. | **Response:** | Field | Type | Description | |-------|------|-------------| | `success` | bool | True if fault was cleared | | `message` | string | Status or error message | +| `auto_cleared_codes` | string[] | Symptom fault codes auto-cleared with the root cause (empty when `skip_correlation_auto_clear=true`) | + +> **Note:** `skip_correlation_auto_clear` was added in `ros2_medkit_msgs` post-0.4.0. Adding a request field changes the service type hash, so out-of-tree callers that invoke `/fault_manager/clear_fault` directly (via `ros2 service call` or a generated client) must rebuild against the new `ros2_medkit_msgs` release to keep talking to `fault_manager`. ## Usage