diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java index fbf1f46a9702..a57f6f9d7f4e 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java @@ -334,6 +334,22 @@ Map> getSafeModeRuleStatuses() */ boolean forceExitSafeMode() throws IOException; + /** + * Check if a specific SCM node is in safe mode. + * @param nodeId SCM node ID to query + * @return true if the node is in safe mode, false otherwise + * @throws IOException + */ + boolean inSafeModeForNode(String nodeId) throws IOException; + + /** + * Get safe mode rule statuses from a specific SCM node. + * @param nodeId SCM node ID to query + * @return Map of rule name to rule status + * @throws IOException + */ + Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException; + /** * Start ReplicationManager. */ diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java index 92ddfa7eb8dc..16a3bbc96468 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java @@ -73,6 +73,14 @@ public interface StorageContainerLocationProtocol extends Closeable { Type.StopReplicationManager, Type.ForceExitSafeMode)); + /** + * Read-only commands that can execute on followers without leader check. + * These commands respect the --scm parameter and query the specified SCM. + */ + Set FOLLOWER_READABLE_COMMAND_TYPES = Collections.unmodifiableSet(EnumSet.of( + Type.InSafeMode, + Type.GetSafeModeRuleStatuses)); + /** * Asks SCM where a container should be allocated. SCM responds with the * set of datanodes that should be used creating this container. @@ -390,6 +398,26 @@ Map> getSafeModeRuleStatuses() */ boolean forceExitSafeMode() throws IOException; + /** + * Check if a specific SCM node is in safe mode. + * In HA clusters, queries the specified node. + * + * @param nodeId SCM node ID to query + * @return true if the node is in safe mode, false otherwise + * @throws IOException + */ + boolean inSafeModeForNode(String nodeId) throws IOException; + + /** + * Get safe mode rule statuses from a specific SCM node. + * In HA clusters, queries the specified node. + * + * @param nodeId SCM node ID to query + * @return Map of rule name to rule status + * @throws IOException + */ + Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException; + /** * Start ReplicationManager. */ diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java index 94b2230e68ba..f29c7e940fa6 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java @@ -145,6 +145,8 @@ import org.apache.hadoop.ozone.upgrade.UpgradeFinalization.StatusAndMessages; import org.apache.hadoop.ozone.util.ProtobufUtils; import org.apache.hadoop.security.token.Token; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class is the client-side translator to translate the requests made on @@ -162,6 +164,8 @@ public final class StorageContainerLocationProtocolClientSideTranslatorPB private final StorageContainerLocationProtocolPB rpcProxy; private final SCMContainerLocationFailoverProxyProvider fpp; + private static final Logger LOG = + LoggerFactory.getLogger(StorageContainerLocationProtocolClientSideTranslatorPB.class); /** * Creates a new StorageContainerLocationProtocolClientSideTranslatorPB. @@ -200,6 +204,34 @@ private ScmContainerLocationResponse submitRequest( return response; } + /** + * Helper method to wrap the request and send the message to a specific SCM node. + * This is used for operations that need to query a specific SCM node in an HA cluster. + * + * @param nodeId the SCM node ID to send the request to + * @param type the request type + * @param builderConsumer consumer to populate the request specific fields + * @return the response from the specified SCM node + */ + private ScmContainerLocationResponse submitRequestToNode( + String nodeId, + StorageContainerLocationProtocolProtos.Type type, + Consumer builderConsumer) throws IOException { + try { + StorageContainerLocationProtocolPB proxy = fpp.getProxyForNode(nodeId); + Builder builder = ScmContainerLocationRequest.newBuilder() + .setCmdType(type) + .setVersion(ClientVersion.CURRENT_VERSION) + .setTraceID(TracingUtil.exportCurrentSpan()); + builderConsumer.accept(builder); + ScmContainerLocationRequest wrapper = builder.build(); + + return proxy.submitRequest(NULL_RPC_CONTROLLER, wrapper); + } catch (ServiceException ex) { + throw ProtobufHelper.getRemoteException(ex); + } + } + private ScmContainerLocationResponse submitRpcRequest( ScmContainerLocationRequest wrapper) throws ServiceException { if (!ADMIN_COMMAND_TYPE.contains(wrapper.getCmdType())) { @@ -843,13 +875,21 @@ public Map> getSafeModeRuleStatuses() submitRequest(Type.GetSafeModeRuleStatuses, builder -> builder.setGetSafeModeRuleStatusesRequest(request)) .getGetSafeModeRuleStatusesResponse(); - Map> map = new HashMap(); - for (SafeModeRuleStatusProto statusProto : - response.getSafeModeRuleStatusesProtoList()) { - map.put(statusProto.getRuleName(), + return buildSafeModeRuleStatusesMap(response); + } + + /** + * Helper method to build a map from GetSafeModeRuleStatusesResponseProto. + * Extracts rule names and their status information. + */ + private Map> buildSafeModeRuleStatusesMap( + GetSafeModeRuleStatusesResponseProto response) { + Map> ruleStatuses = new HashMap<>(); + for (SafeModeRuleStatusProto statusProto : response.getSafeModeRuleStatusesProtoList()) { + ruleStatuses.put(statusProto.getRuleName(), Pair.of(statusProto.getValidate(), statusProto.getStatusText())); } - return map; + return ruleStatuses; } /** @@ -870,6 +910,24 @@ public boolean forceExitSafeMode() throws IOException { } + @Override + public boolean inSafeModeForNode(String nodeId) throws IOException { + InSafeModeRequestProto request = InSafeModeRequestProto.getDefaultInstance(); + return submitRequestToNode(nodeId, Type.InSafeMode, + builder -> builder.setInSafeModeRequest(request)) + .getInSafeModeResponse().getInSafeMode(); + } + + @Override + public Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException { + GetSafeModeRuleStatusesRequestProto request = GetSafeModeRuleStatusesRequestProto.getDefaultInstance(); + GetSafeModeRuleStatusesResponseProto response = + submitRequestToNode(nodeId, Type.GetSafeModeRuleStatuses, + builder -> builder.setGetSafeModeRuleStatusesRequest(request)) + .getGetSafeModeRuleStatusesResponse(); + return buildSafeModeRuleStatusesMap(response); + } + @Override public void startReplicationManager() throws IOException { diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java index 0dfa95a95246..272db7a04ae4 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java @@ -193,6 +193,17 @@ public synchronized List getProxies() { .map(proxyInfo -> proxyInfo.proxy).collect(Collectors.toList()); } + public synchronized T getProxyForNode(String nodeId) throws IOException { + ProxyInfo proxyInfo = scmProxies.get(nodeId); + if (proxyInfo == null) { + if (!scmProxyInfoMap.containsKey(nodeId)) { + throw new IOException("Unknown SCM node ID: " + nodeId); + } + proxyInfo = createSCMProxy(nodeId); + } + return proxyInfo.proxy; + } + @Override public synchronized void performFailover(T newLeader) { if (updatedLeaderNodeID != null) { diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java index 3b061aa10c01..56294ec3590b 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java @@ -29,6 +29,7 @@ import static org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type.ListContainer; import static org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type.ListPipelines; import static org.apache.hadoop.hdds.scm.protocol.StorageContainerLocationProtocol.ADMIN_COMMAND_TYPE; +import static org.apache.hadoop.hdds.scm.protocol.StorageContainerLocationProtocol.FOLLOWER_READABLE_COMMAND_TYPES; import com.google.protobuf.ProtocolMessageEnum; import com.google.protobuf.RpcController; @@ -210,9 +211,12 @@ public StorageContainerLocationProtocolServerSideTranslatorPB( @Override public ScmContainerLocationResponse submitRequest(RpcController controller, ScmContainerLocationRequest request) throws ServiceException { - // not leader or not belong to admin command. + // Trigger not leader exception unless: + // This is the leader node, or this is an admin command, + // or this is a follower-readable command. if (!scm.checkLeader() - && !ADMIN_COMMAND_TYPE.contains(request.getCmdType())) { + && !ADMIN_COMMAND_TYPE.contains(request.getCmdType()) + && !FOLLOWER_READABLE_COMMAND_TYPES.contains(request.getCmdType())) { RatisUtil.checkRatisException( scm.getScmHAManager().getRatisServer().triggerNotLeaderException(), scm.getClientRpcPort(), scm.getScmId(), scm.getHostname(), ROLE_TYPE); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index 3bd5214ea0ec..4e70970ddae9 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -1033,6 +1033,42 @@ public Map> getSafeModeRuleStatuses() } } + @Override + public boolean inSafeModeForNode(String nodeId) throws IOException { + Map auditMap = Maps.newHashMap(); + auditMap.put("nodeId", nodeId); + try { + boolean result = scm.isInSafeMode(); + AUDIT.logReadSuccess( + buildAuditMessageForSuccess(SCMAction.IN_SAFE_MODE, auditMap) + ); + return result; + } catch (Exception ex) { + AUDIT.logReadFailure( + buildAuditMessageForFailure(SCMAction.IN_SAFE_MODE, auditMap, ex) + ); + throw ex; + } + } + + @Override + public Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException { + Map auditMap = Maps.newHashMap(); + auditMap.put("nodeId", nodeId); + try { + Map> result = scm.getRuleStatus(); + AUDIT.logReadSuccess( + buildAuditMessageForSuccess(SCMAction.GET_SAFE_MODE_RULE_STATUSES, auditMap) + ); + return result; + } catch (Exception ex) { + AUDIT.logReadFailure( + buildAuditMessageForFailure(SCMAction.GET_SAFE_MODE_RULE_STATUSES, auditMap, ex) + ); + throw ex; + } + } + /** * Force SCM out of Safe mode. * diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java index 61c0f4150c34..2c7c0f9a57b0 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java @@ -477,6 +477,16 @@ public boolean forceExitSafeMode() throws IOException { return storageContainerLocationClient.forceExitSafeMode(); } + @Override + public boolean inSafeModeForNode(String nodeId) throws IOException { + return storageContainerLocationClient.inSafeModeForNode(nodeId); + } + + @Override + public Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException { + return storageContainerLocationClient.getSafeModeRuleStatusesForNode(nodeId); + } + @Override public void startReplicationManager() throws IOException { storageContainerLocationClient.startReplicationManager(); diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java index d15be56410f8..bc44b66e2529 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java @@ -18,10 +18,19 @@ package org.apache.hadoop.hdds.scm.cli; import java.io.IOException; +import java.net.InetSocketAddress; +import java.util.List; import java.util.Map; +import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.hdds.HddsUtils; import org.apache.hadoop.hdds.cli.HddsVersionProvider; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.scm.client.ScmClient; +import org.apache.hadoop.hdds.scm.ha.SCMNodeInfo; +import org.apache.hadoop.net.NetUtils; +import picocli.CommandLine; import picocli.CommandLine.Command; /** @@ -33,24 +42,191 @@ mixinStandardHelpOptions = true, versionProvider = HddsVersionProvider.class) public class SafeModeCheckSubcommand extends ScmSubcommand { + @CommandLine.Option(names = {"--all", "-a"}, + description = "Show safe mode status for all SCM nodes in the service. " + + "When multiple SCM service IDs are configured, --service-id must be specified.") + private boolean allNodes; + + private String serviceId; + private List nodes; @Override public void execute(ScmClient scmClient) throws IOException { - boolean execReturn = scmClient.inSafeMode(); + OzoneConfiguration conf = getOzoneConf(); + serviceId = HddsUtils.getScmServiceId(conf); + String scmAddress = getScmOption().getScm(); + if (serviceId != null) { + nodes = SCMNodeInfo.buildNodeInfo(conf); + } + + if (allNodes) { + executeForAllNodes(scmClient); + } else if (StringUtils.isNotEmpty(scmAddress)) { + executeForSpecificNode(scmClient, scmAddress); + } else { + executeForSingleNode(scmClient); + } + } + + private void executeForSingleNode(ScmClient scmClient) throws IOException { + boolean inSafeMode; + Map> rules = null; + String leaderNodeId; - // Output data list - if (execReturn) { + // If SCM HA mode, query the leader node. + if (serviceId != null) { + leaderNodeId = findLeaderNodeId(scmClient); + if (leaderNodeId == null) { + throw new IOException("Unable to determine SCM leader for the service " + serviceId); + } + System.out.printf("SCM node %s%n", leaderNodeId); + inSafeMode = scmClient.inSafeModeForNode(leaderNodeId); + if (isVerbose()) { + rules = scmClient.getSafeModeRuleStatusesForNode(leaderNodeId); + } + } else { + // Non-HA mode + inSafeMode = scmClient.inSafeMode(); + if (isVerbose()) { + rules = scmClient.getSafeModeRuleStatuses(); + } + } + + if (inSafeMode) { System.out.println("SCM is in safe mode."); } else { System.out.println("SCM is out of safe mode."); } - if (isVerbose()) { - for (Map.Entry> entry : - scmClient.getSafeModeRuleStatuses().entrySet()) { - Pair value = entry.getValue(); - System.out.printf("validated:%s, %s, %s%n", - value.getLeft(), entry.getKey(), value.getRight()); + if (isVerbose() && rules != null) { + printSafeModeRules(rules); + } + } + + /** + * Find the leader node ID from SCM roles. + * @param scmClient the SCM client + * @return the leader node ID, or null if not found + */ + private String findLeaderNodeId(ScmClient scmClient) throws IOException { + try { + List roles = scmClient.getScmRoles(); + + for (String role : roles) { + String[] parts = role.split(":"); + if (parts.length < 3 || !"LEADER".equalsIgnoreCase(parts[2])) { + continue; + } + + String leaderHost = parts[0]; + String leaderIp = parts.length >= 5 ? parts[4] : null; + + for (SCMNodeInfo node : nodes) { + String nodeHost = node.getScmClientAddress().split(":")[0]; + + if (matchesAddress(leaderHost, nodeHost) || (leaderIp != null && !leaderIp.isEmpty() && + matchesAddress(leaderIp, nodeHost))) { + return node.getNodeId(); + } + } + } + + return null; + } catch (IOException e) { + throw new IOException("Could not determine leader node for service: " + serviceId, e); + } + } + + private void executeForSpecificNode(ScmClient scmClient, String scmAddress) throws IOException { + if (serviceId == null) { + executeForSingleNode(scmClient); + return; + } + + System.out.println("Service ID: " + serviceId); + // Find the node matching the --scm address + List matchedNodes = nodes.stream() + .filter(node -> matchesAddress(node.getScmClientAddress(), scmAddress)) + .collect(Collectors.toList()); + + if (matchedNodes.isEmpty()) { + throw new IOException("Specified --scm address " + scmAddress + + " does not match any node in service " + serviceId + + ". Nodes: " + nodes.stream() + .map(n -> n.getScmClientAddress() + " [" + n.getNodeId() + "]") + .collect(Collectors.joining(", "))); + } + + queryNode(scmClient, matchedNodes.get(0)); + } + + private void executeForAllNodes(ScmClient scmClient) throws IOException { + if (serviceId == null) { + executeForSingleNode(scmClient); + return; + } + + System.out.println("Service ID: " + serviceId); + + for (SCMNodeInfo node : nodes) { + queryNode(scmClient, node); + } + } + + private void queryNode(ScmClient scmClient, SCMNodeInfo node) { + String nodeId = node.getNodeId(); + + try { + boolean inSafeMode = scmClient.inSafeModeForNode(nodeId); + + System.out.printf("%s [%s]: %s%n", + node.getScmClientAddress(), + nodeId, + inSafeMode ? "IN SAFE MODE" : "OUT OF SAFE MODE"); + + if (isVerbose()) { + Map> rules = scmClient.getSafeModeRuleStatusesForNode(nodeId); + if (rules != null && !rules.isEmpty()) { + printSafeModeRules(rules); + } } + } catch (Exception e) { + System.out.printf("%s [%s]: ERROR: Failed to get safe mode status - %s%n", + node.getScmClientAddress(), nodeId, e.getMessage()); + } + } + + /** + * Check if the given SCMNodeInfo matches the target address. + * Tries to match by direct string comparison and by resolved address. + */ + private boolean matchesAddress(String address1, String address2) { + if (address1.equalsIgnoreCase(address2)) { + return true; + } + + // Normalizing both addresses and comparing + try { + InetSocketAddress addr1 = NetUtils.createSocketAddr(address1); + InetSocketAddress addr2 = NetUtils.createSocketAddr(address2); + + if (addr1.getAddress() == null || addr2.getAddress() == null) { + return false; + } + return addr1.getAddress().equals(addr2.getAddress()) && + (addr1.getPort() == 0 || addr2.getPort() == 0 || + addr1.getPort() == addr2.getPort()); + + } catch (Exception e) { + // If address resolution fails, no match + return false; + } + } + + private void printSafeModeRules(Map> rules) { + for (Map.Entry> entry : rules.entrySet()) { + Pair value = entry.getValue(); + System.out.printf("validated:%s, %s, %s%n", + value.getLeft(), entry.getKey(), value.getRight()); } } } diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmOption.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmOption.java index 640433c99b3f..8b8d4b9f0f4b 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmOption.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmOption.java @@ -88,4 +88,8 @@ public SCMSecurityProtocol createScmSecurityClient() { "Can't create SCM Security client", ex); } } + + public String getScm() { + return scm; + } } diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmSubcommand.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmSubcommand.java index 6681a4894dbe..c81a2d4c2e78 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmSubcommand.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmSubcommand.java @@ -33,6 +33,10 @@ public abstract class ScmSubcommand extends AbstractSubcommand implements Callab protected abstract void execute(ScmClient client) throws IOException; + protected ScmOption getScmOption() { + return scmOption; + } + @Override public final Void call() throws Exception { try (ScmClient scmClient = scmOption.createScmClient()) { diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestSafeModeCheckSubcommandHA.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestSafeModeCheckSubcommandHA.java new file mode 100644 index 000000000000..a200d472e6bd --- /dev/null +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestSafeModeCheckSubcommandHA.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.shell; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeoutException; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.scm.server.StorageContainerManager; +import org.apache.hadoop.hdds.utils.IOUtils; +import org.apache.hadoop.ozone.MiniOzoneCluster; +import org.apache.hadoop.ozone.MiniOzoneHAClusterImpl; +import org.apache.hadoop.ozone.admin.OzoneAdmin; +import org.apache.ozone.test.GenericTestUtils; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; + +/** + * Integration tests for SafeModeCheckSubcommand in HA mode. + * Tests the 'ozone admin safemode status' command with SCM HA cluster. + */ +@TestInstance(TestInstance.Lifecycle.PER_CLASS) +public class TestSafeModeCheckSubcommandHA { + private OzoneAdmin ozoneAdmin; + private MiniOzoneHAClusterImpl cluster; + private GenericTestUtils.PrintStreamCapturer out; + private GenericTestUtils.PrintStreamCapturer err; + + @BeforeAll + void init() throws IOException, InterruptedException, TimeoutException { + OzoneConfiguration conf = new OzoneConfiguration(); + cluster = MiniOzoneCluster.newHABuilder(conf) + .setOMServiceId("om-test") + .setNumOfOzoneManagers(3) + .setSCMServiceId("scm-test") + .setNumOfStorageContainerManagers(3) + .build(); + + cluster.waitForClusterToBeReady(); + } + + @BeforeEach + void setupCapture() { + out = GenericTestUtils.captureOut(); + err = GenericTestUtils.captureErr(); + ozoneAdmin = new OzoneAdmin(); + Map configOverrides = new HashMap<>(); + cluster.getConf().forEach(entry -> + configOverrides.put(entry.getKey(), entry.getValue())); + + ozoneAdmin.setConfigurationOverrides(configOverrides); + } + + @AfterEach + void stopCapture() { + IOUtils.closeQuietly(out); + IOUtils.closeQuietly(err); + } + + @Test + public void testNoOptionQueriesLeader() { + String[] args = {"safemode", "status"}; + ozoneAdmin.execute(args); + String output = out.get(); + + assertThat(output).contains(cluster.getScmLeader().getSCMNodeId()); + assertThat(output).containsPattern("SCM is (in|out of) safe mode\\."); + } + + @Test + public void testNoOptionWithVerboseShowsRules() { + String[] args = {"safemode", "status", "--verbose"}; + ozoneAdmin.execute(args); + String output = out.get(); + + assertThat(output).contains(cluster.getScmLeader().getSCMNodeId()); + assertThat(output).containsPattern("SCM is (in|out of) safe mode\\."); + assertAllSafeModeRules(output); + } + + @Test + public void testScmOptionSpecificNodeByAddress() { + // Query each SCM node individually + List scms = cluster.getStorageContainerManagers(); + String serviceId = getServiceId(); + for (StorageContainerManager scm : scms) { + String nodeId = scm.getSCMNodeId(); + String hostPort = cluster.getConf().get("ozone.scm.client.address." + serviceId + "." + nodeId); + + String[] args = {"safemode", "status", "--scm", hostPort}; + ozoneAdmin.execute(args); + String output = out.get(); + + assertThat(output).contains("Service ID: " + serviceId); + assertThat(output).contains(nodeId); + assertThat(output).containsPattern("\\[" + nodeId + "\\]: (IN|OUT OF) SAFE MODE"); + } + } + + @Test + public void testScmOptionWithVerbose() { + // Query specific scm node with verbose flag + StorageContainerManager scm = cluster.getStorageContainerManagers().get(0); + String nodeId = scm.getSCMNodeId(); + String serviceId = getServiceId(); + String hostPort = cluster.getConf().get("ozone.scm.client.address." + serviceId + "." + nodeId); + + String[] args = {"safemode", "status", "--scm", hostPort, "--verbose"}; + ozoneAdmin.execute(args); + String output = out.get(); + + assertThat(output).contains("Service ID: " + serviceId); + assertThat(output).contains(nodeId); + assertAllSafeModeRules(output); + } + + @Test + public void testAllOptionShowsAllNodes() { + // Query all nodes in the cluster + String[] args = {"safemode", "status", "--all"}; + ozoneAdmin.execute(args); + String output = out.get(); + + assertThat(output).contains("Service ID: " + getServiceId()); + assertAllScmNodes(output); + } + + @Test + public void testAllOptionWithVerboseShowsAllRules() { + // Query all nodes with verbose flag + String[] args = {"safemode", "status", "--all", "--verbose"}; + ozoneAdmin.execute(args); + String output = out.get(); + + assertThat(output).contains("Service ID: " + getServiceId()); + assertAllScmNodes(output); + assertAllSafeModeRules(output); + } + + private String getServiceId() { + return cluster.getConf().get("ozone.scm.service.ids"); + } + + private void assertAllSafeModeRules(String output) { + assertThat(output).contains("DataNodeSafeModeRule"); + assertThat(output).contains("RatisContainerSafeModeRule"); + assertThat(output).contains("HealthyPipelineSafeModeRule"); + assertThat(output).contains("StateMachineReadyRule"); + assertThat(output).contains("OneReplicaPipelineSafeModeRule"); + assertThat(output).contains("ECContainerSafeModeRule"); + } + + private void assertAllScmNodes(String output) { + List scms = cluster.getStorageContainerManagers(); + for (StorageContainerManager scm : scms) { + String nodeId = scm.getSCMNodeId(); + assertThat(output).contains(nodeId); + assertThat(output).containsPattern("\\[" + nodeId + "\\]: (IN|OUT OF) SAFE MODE"); + } + } +}