diff --git a/hadoop-hdds/interface-client/src/main/proto/hdds.proto b/hadoop-hdds/interface-client/src/main/proto/hdds.proto index eb819b80a3e8..fd908b38e6b0 100644 --- a/hadoop-hdds/interface-client/src/main/proto/hdds.proto +++ b/hadoop-hdds/interface-client/src/main/proto/hdds.proto @@ -180,7 +180,7 @@ enum NodeState { HEALTHY = 1; STALE = 2; DEAD = 3; - HEALTHY_READONLY = 6; + HEALTHY_READONLY = 6; // (deprecated) } enum NodeOperationalState { diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java index 2255fb392593..3eb4e49efd5c 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java @@ -191,17 +191,7 @@ public final class SCMEvents { * state to healthy state. */ public static final TypedEvent - HEALTHY_READONLY_TO_HEALTHY_NODE = - new TypedEvent<>(DatanodeDetails.class, - "HEALTHY_READONLY_TO_HEALTHY_NODE"); - - /** - * This event will be triggered whenever a datanode is moved to a - * healthy-readonly state. - */ - public static final TypedEvent - HEALTHY_READONLY_NODE = - new TypedEvent<>(DatanodeDetails.class, "HEALTHY_READONLY_NODE"); + UNHEALTHY_TO_HEALTHY_NODE = new TypedEvent<>(DatanodeDetails.class, "UNHEALTHY_TO_HEALTHY_NODE"); /** * This event will be triggered by CommandStatusReportHandler whenever a diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/HealthyReadOnlyNodeHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/HealthyReadOnlyNodeHandler.java deleted file mode 100644 index ed438117073a..000000000000 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/HealthyReadOnlyNodeHandler.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hdds.scm.node; - -import java.io.IOException; -import java.util.Objects; -import java.util.Set; -import org.apache.hadoop.hdds.protocol.DatanodeDetails; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos; -import org.apache.hadoop.hdds.scm.net.NetworkTopology; -import org.apache.hadoop.hdds.scm.pipeline.Pipeline; -import org.apache.hadoop.hdds.scm.pipeline.PipelineID; -import org.apache.hadoop.hdds.scm.pipeline.PipelineManager; -import org.apache.hadoop.hdds.server.events.EventHandler; -import org.apache.hadoop.hdds.server.events.EventPublisher; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Handles non healthy to healthy(ReadOnly) node event. - */ -public class HealthyReadOnlyNodeHandler - implements EventHandler { - - private static final Logger LOG = - LoggerFactory.getLogger(HealthyReadOnlyNodeHandler.class); - private final PipelineManager pipelineManager; - private final NodeManager nodeManager; - - public HealthyReadOnlyNodeHandler( - NodeManager nodeManager, PipelineManager pipelineManager) { - this.pipelineManager = pipelineManager; - this.nodeManager = nodeManager; - } - - @Override - public void onMessage(DatanodeDetails datanodeDetails, - EventPublisher publisher) { - LOG.info("Datanode {} moved to HEALTHY READONLY state.", datanodeDetails); - - /* - * Order of finalization operations should be: - * 1. SCM closes all pipelines. - * - This queues close commands for all containers to be sent to the - * datanodes. - * - Pipelines will remain in the DB until the scrubber removes them - * since we did not force close the pipelines. - * 2. SCM finalizes. - * 3. SCM moves all datanodes healthy readonly state. - * - Before this, no datanode should have been moved to healthy - * readonly, even if it heartbeated while SCM was finalizing. - * - * During the initial pipeline close phase, some containers may end up - * in the CLOSING state if they were in the process of leader election - * for their pipeline when the close command was received. A datanode - * cannot finalize with CLOSING containers, so we want to move those - * containers to CLOSE soon without waiting for the replication manager - * to do it. - * - * To do this, we will resend close commands for each pipeline. Since the - * pipelines should already be closed and we are not force closing them, no - * pipeline action is queued for the datanode. However, close container - * commands are still queued for all containers currently in the pipeline. - * The datanode will ignore these commands for CLOSED containers, but it - * allows CLOSING containers to move to CLOSED so finalization can progress. - */ - Set pipelineIDs = nodeManager.getPipelines(datanodeDetails); - for (PipelineID pipelineID : pipelineIDs) { - try { - Pipeline pipeline = pipelineManager.getPipeline(pipelineID); - LOG.info("Sending close command for pipeline {} in state {} which " + - "uses {} datanode {}. This will send close commands for its " + - "containers.", - pipelineID, pipeline.getPipelineState(), - HddsProtos.NodeState.HEALTHY_READONLY, - datanodeDetails); - pipelineManager.closePipeline(pipelineID); - } catch (IOException ex) { - LOG.error("Failed to close pipeline {} which uses HEALTHY READONLY " + - "datanode {}: ", pipelineID, datanodeDetails, ex); - } - } - - //add node back if it is not present in networkTopology - NetworkTopology nt = nodeManager.getClusterNetworkTopologyMap(); - if (!nt.contains(datanodeDetails)) { - nt.add(datanodeDetails); - // make sure after DN is added back into topology, DatanodeDetails - // instance returned from nodeStateManager has parent correctly set. - Objects.requireNonNull( - nodeManager.getNode(datanodeDetails.getID()) - .getParent(), "Parent == null"); - } - } -} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStateManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStateManager.java index fb065a0086b9..b24bebde3ae4 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStateManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStateManager.java @@ -19,12 +19,10 @@ import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DEAD; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY; -import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY_READONLY; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.STALE; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL; -import static org.apache.hadoop.hdds.scm.events.SCMEvents.HEALTHY_READONLY_NODE; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; @@ -56,13 +54,11 @@ import org.apache.hadoop.hdds.scm.node.states.NodeStateMap; import org.apache.hadoop.hdds.scm.pipeline.Pipeline; import org.apache.hadoop.hdds.scm.pipeline.PipelineID; -import org.apache.hadoop.hdds.scm.server.upgrade.FinalizationManager; import org.apache.hadoop.hdds.server.events.Event; import org.apache.hadoop.hdds.server.events.EventPublisher; import org.apache.hadoop.hdds.utils.HddsServerUtil; import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException; import org.apache.hadoop.ozone.common.statemachine.StateMachine; -import org.apache.hadoop.ozone.upgrade.LayoutVersionManager; import org.apache.hadoop.util.Time; import org.apache.hadoop.util.concurrent.HadoopExecutors; import org.slf4j.Logger; @@ -144,27 +140,15 @@ public class NodeStateManager implements Runnable, Closeable { */ private long skippedHealthChecks; - private LayoutVersionManager layoutVersionManager; - - /** - * Conditions to check whether a node's metadata layout version matches - * that of SCM. - */ - private final Predicate layoutMatchCondition; - private final Predicate layoutMisMatchCondition; - /** * Constructs a NodeStateManager instance with the given configuration. * * @param conf Configuration * @param eventPublisher event publisher - * @param layoutManager Layout version manager */ public NodeStateManager(ConfigurationSource conf, EventPublisher eventPublisher, - LayoutVersionManager layoutManager, SCMContext scmContext) { - this.layoutVersionManager = layoutManager; this.nodeStateMap = new NodeStateMap(); this.node2PipelineMap = new Node2PipelineMap(); this.eventPublisher = eventPublisher; @@ -194,24 +178,6 @@ public NodeStateManager(ConfigurationSource conf, skippedHealthChecks = 0; checkPaused = false; // accessed only from test functions - // This will move a datanode out of healthy readonly state if passed. - layoutMatchCondition = (layout) -> - (layout.getMetadataLayoutVersion() == - layoutVersionManager.getMetadataLayoutVersion()) && - (layout.getSoftwareLayoutVersion() == - layoutVersionManager.getSoftwareLayoutVersion()); - - // This will move a datanode in to healthy readonly state if passed. - // When SCM finishes finalizing, it will automatically move all datanodes - // to healthy readonly as well. - // If nodes heartbeat while SCM is finalizing, they should not be moved - // to healthy readonly until SCM finishes updating its MLV, hence the - // checkpoint check here. - layoutMisMatchCondition = (layout) -> - FinalizationManager.shouldTellDatanodesToFinalize( - scmContext.getFinalizationCheckpoint()) && - !layoutMatchCondition.test(layout); - scheduleNextHealthCheck(); } @@ -221,32 +187,20 @@ public NodeStateManager(ConfigurationSource conf, private void initialiseState2EventMap() { state2EventMap.put(STALE, SCMEvents.STALE_NODE); state2EventMap.put(DEAD, SCMEvents.DEAD_NODE); - state2EventMap - .put(HEALTHY, SCMEvents.HEALTHY_READONLY_TO_HEALTHY_NODE); - state2EventMap - .put(NodeState.HEALTHY_READONLY, HEALTHY_READONLY_NODE); + state2EventMap.put(HEALTHY, SCMEvents.UNHEALTHY_TO_HEALTHY_NODE); } /* * * Node and State Transition Mapping: * - * State: HEALTHY -------------------> STALE - * Event: TIMEOUT - * - * State: HEALTHY -------------------> HEALTHY_READONLY - * Event: LAYOUT_MISMATCH - * - * State: HEALTHY_READONLY -------------------> HEALTHY - * Event: LAYOUT_MATCH - * - * State: HEALTHY_READONLY -------------------> STALE + * State: HEALTHY -------------------> STALE * Event: TIMEOUT * - * State: STALE -------------------> HEALTHY_READONLY + * State: STALE -------------------> HEALTHY * Event: RESTORE * - * State: DEAD -------------------> HEALTHY_READONLY + * State: DEAD -------------------> HEALTHY * Event: RESURRECT * * State: STALE -------------------> DEAD @@ -254,39 +208,16 @@ private void initialiseState2EventMap() { * * Node State Flow * - * +-----<---------<---+ - * | (RESURRECT) | - * +-->-----(LAYOUT_MISMATCH)-->--+ V | - * | | | ^ - * | | | | - * | V V | - * | +-----(LAYOUT_MATCH)--[HEALTHY_READONLY] | - * | | ^ | | - * | | | | ^ - * | | | |(TIMEOUT) | - * ^ | (RESTORE) | | | - * | V | V | - * [HEALTHY]---->----------------->[STALE]------->--------->[DEAD] - * (TIMEOUT) (TIMEOUT) - * */ /** * Initializes the lifecycle of node state machine. */ private void initializeStateMachines() { - nodeHealthSM.addTransition(HEALTHY_READONLY, HEALTHY, - NodeLifeCycleEvent.LAYOUT_MATCH); - nodeHealthSM.addTransition(HEALTHY_READONLY, STALE, - NodeLifeCycleEvent.TIMEOUT); nodeHealthSM.addTransition(HEALTHY, STALE, NodeLifeCycleEvent.TIMEOUT); - nodeHealthSM.addTransition(HEALTHY, HEALTHY_READONLY, - NodeLifeCycleEvent.LAYOUT_MISMATCH); nodeHealthSM.addTransition(STALE, DEAD, NodeLifeCycleEvent.TIMEOUT); - nodeHealthSM.addTransition(STALE, HEALTHY_READONLY, - NodeLifeCycleEvent.RESTORE); - nodeHealthSM.addTransition(DEAD, HEALTHY_READONLY, - NodeLifeCycleEvent.RESURRECT); + nodeHealthSM.addTransition(STALE, HEALTHY, NodeLifeCycleEvent.RESTORE); + nodeHealthSM.addTransition(DEAD, HEALTHY, NodeLifeCycleEvent.RESURRECT); } /** @@ -309,7 +240,7 @@ public void addNode(DatanodeDetails datanodeDetails, } private DatanodeInfo newDatanodeInfo(DatanodeDetails datanode, LayoutVersionProto layout) { - final NodeStatus status = newNodeStatus(datanode, layout); + final NodeStatus status = newNodeStatus(datanode); return new DatanodeInfo(datanode, status, layout); } @@ -320,22 +251,15 @@ private DatanodeInfo newDatanodeInfo(DatanodeDetails datanode, LayoutVersionProt * updated to reflect the datanode state. * @param dn DatanodeDetails reported by the datanode */ - private NodeStatus newNodeStatus(DatanodeDetails dn, - LayoutVersionProto layoutInfo) { + private NodeStatus newNodeStatus(DatanodeDetails dn) { HddsProtos.NodeOperationalState dnOpState = dn.getPersistedOpState(); - NodeState state = HEALTHY; - - if (layoutMisMatchCondition.test(layoutInfo)) { - state = HEALTHY_READONLY; - } - if (dnOpState != NodeOperationalState.IN_SERVICE) { LOG.info("Updating nodeOperationalState on registration as the " + "datanode has a persisted state of {} and expiry of {}", dnOpState, dn.getPersistedOpStateExpiryEpochSec()); - return NodeStatus.valueOf(dnOpState, state, dn.getPersistedOpStateExpiryEpochSec()); + return NodeStatus.valueOf(dnOpState, HEALTHY, dn.getPersistedOpStateExpiryEpochSec()); } else { - return NodeStatus.valueOf(NodeOperationalState.IN_SERVICE, state); + return NodeStatus.valueOf(NodeOperationalState.IN_SERVICE, HEALTHY); } } @@ -755,45 +679,8 @@ public void run() { scheduleNextHealthCheck(); } - /** - * Upgrade finalization needs to move all nodes to a healthy readonly state - * when finalization finishes to make sure no nodes with metadata layout - * version older than SCM's are used in pipelines. Pipeline creation is - * still frozen at this point in the finalization flow. - * - * This method is synchronized to coordinate node state updates between - * the upgrade finalization thread which calls this method, and the - * node health processing thread that calls {@link #checkNodesHealth}. - */ - public synchronized void forceNodesToHealthyReadOnly() { - try { - List nodes = nodeStateMap.getDatanodeInfos(null, HEALTHY); - for (DatanodeInfo node : nodes) { - nodeStateMap.updateNodeHealthState(node.getID(), - HEALTHY_READONLY); - if (state2EventMap.containsKey(HEALTHY_READONLY)) { - // At this point pipeline creation is already frozen and the node's - // state has been updated in nodeStateMap. This event should be a - // no-op aside from logging a message, so it is ok to complete - // asynchronously. - eventPublisher.fireEvent(state2EventMap.get(HEALTHY_READONLY), - node); - } - } - - } catch (NodeNotFoundException ex) { - LOG.error("Inconsistent NodeStateMap! {}", nodeStateMap); - } - } - - /** - * This method is synchronized to coordinate node state updates between - * the upgrade finalization thread which calls - * {@link #forceNodesToHealthyReadOnly}, and the node health processing - * thread that calls this method. - */ @VisibleForTesting - public synchronized void checkNodesHealth() { + public void checkNodesHealth() { /* * @@ -840,16 +727,6 @@ public synchronized void checkNodesHealth() { NodeStatus status = nodeStateMap.getNodeStatus(node.getID()); switch (status.getHealth()) { case HEALTHY: - updateNodeLayoutVersionState(node, layoutMisMatchCondition, status, - NodeLifeCycleEvent.LAYOUT_MISMATCH); - // Move the node to STALE if the last heartbeat time is less than - // configured stale-node interval. - updateNodeState(node, staleNodeCondition, status, - NodeLifeCycleEvent.TIMEOUT); - break; - case HEALTHY_READONLY: - updateNodeLayoutVersionState(node, layoutMatchCondition, status, - NodeLifeCycleEvent.LAYOUT_MATCH); // Move the node to STALE if the last heartbeat time is less than // configured stale-node interval. updateNodeState(node, staleNodeCondition, status, @@ -962,37 +839,6 @@ private void fireHealthStateEvent(HddsProtos.NodeState health, } } - /** - * Updates the node state if the condition satisfies. - * - * @param node DatanodeInfo - * @param condition condition to check - * @param status current state of node - * @param lifeCycleEvent NodeLifeCycleEvent to be applied if condition - * matches - * - * @throws NodeNotFoundException if the node is not present - */ - private void updateNodeLayoutVersionState(DatanodeInfo node, - Predicate condition, - NodeStatus status, - NodeLifeCycleEvent lifeCycleEvent) - throws NodeNotFoundException { - try { - if (condition.test(node.getLastKnownLayoutVersion())) { - NodeState newHealthState = nodeHealthSM.getNextState(status.getHealth(), - lifeCycleEvent); - NodeStatus newStatus = - nodeStateMap.updateNodeHealthState(node.getID(), newHealthState); - fireHealthStateEvent(newStatus.getHealth(), node); - } - } catch (InvalidStateTransitionException e) { - LOG.warn("Invalid state transition of node {}." + - " Current state: {}, life cycle event: {}", - node, status, lifeCycleEvent); - } - } - @Override public void close() { executorService.shutdown(); @@ -1071,6 +917,6 @@ protected void removeNode(DatanodeID datanodeID) { * Node's life cycle events. */ private enum NodeLifeCycleEvent { - TIMEOUT, RESTORE, RESURRECT, LAYOUT_MISMATCH, LAYOUT_MATCH + TIMEOUT, RESTORE, RESURRECT } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStatus.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStatus.java index fab56d92c268..56ad2f175b88 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStatus.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStatus.java @@ -24,7 +24,6 @@ import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DEAD; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY; -import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY_READONLY; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.STALE; import java.util.Collections; @@ -64,8 +63,6 @@ public final class NodeStatus { private static final NodeStatus IN_SERVICE_HEALTHY = valueOf(IN_SERVICE, HEALTHY); - private static final NodeStatus IN_SERVICE_HEALTHY_READONLY = valueOf(IN_SERVICE, HEALTHY_READONLY); - private static final Set MAINTENANCE_STATES = Collections.unmodifiableSet( EnumSet.of(ENTERING_MAINTENANCE, IN_MAINTENANCE)); @@ -124,11 +121,6 @@ public static NodeStatus inServiceHealthy() { return IN_SERVICE_HEALTHY; } - /** @return the status of {@link NodeOperationalState#IN_SERVICE} and {@link NodeState#HEALTHY_READONLY}. */ - public static NodeStatus inServiceHealthyReadOnly() { - return IN_SERVICE_HEALTHY_READONLY; - } - /** @return the status of {@link NodeOperationalState#IN_SERVICE} and {@link NodeState#STALE}. */ public static NodeStatus inServiceStale() { return IN_SERVICE_STALE; @@ -219,10 +211,9 @@ public boolean isInMaintenance() { return operationalState == IN_MAINTENANCE; } - /** @return true iff this node is {@link NodeState#HEALTHY} or {@link NodeState#HEALTHY_READONLY}. */ + /** @return true iff this node is {@link NodeState#HEALTHY} */ public boolean isHealthy() { - return health == HEALTHY - || health == HEALTHY_READONLY; + return health == HEALTHY; } /** @return true iff this node is {@link NodeState#HEALTHY} or {@link NodeState#STALE}. */ diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java index ff561411f344..fedcb5fb17dd 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java @@ -21,7 +21,6 @@ import static org.apache.hadoop.hdds.protocol.DatanodeDetails.Port.Name.HTTPS; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY; -import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY_READONLY; import static org.apache.hadoop.hdds.scm.SCMCommonPlacementPolicy.hasEnoughSpace; import com.google.common.annotations.VisibleForTesting; @@ -179,8 +178,7 @@ public SCMNodeManager( HDDSLayoutVersionManager layoutVersionManager, Function nodeResolver) { this.scmNodeEventPublisher = eventPublisher; - this.nodeStateManager = new NodeStateManager(conf, eventPublisher, - layoutVersionManager, scmContext); + this.nodeStateManager = new NodeStateManager(conf, eventPublisher, scmContext); this.version = VersionInfo.getLatestVersion(); this.commandQueue = new CommandQueue(); this.scmStorageConfig = scmStorageConfig; @@ -970,12 +968,9 @@ public Map getNodeStats() { final List healthyNodes = nodeStateManager .getNodes(null, HEALTHY); - final List healthyReadOnlyNodes = nodeStateManager - .getNodes(null, HEALTHY_READONLY); final List staleNodes = nodeStateManager .getStaleNodes(); final List datanodes = new ArrayList<>(healthyNodes); - datanodes.addAll(healthyReadOnlyNodes); datanodes.addAll(staleNodes); for (DatanodeInfo dnInfo : datanodes) { @@ -1113,6 +1108,11 @@ public Map> getNodeCount() { for (NodeOperationalState opState : NodeOperationalState.values()) { Map states = new HashMap<>(); for (NodeState health : NodeState.values()) { + if (health == NodeState.HEALTHY_READONLY) { + // HEALTHY_READONLY is deprecated and can no longer occur in SCM, but it cannot be removed + // from the protobuf for compatibility reasons. Skip it here to avoid confusion. + continue; + } states.put(health.name(), 0); } nodes.put(opState.name(), states); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/ReadOnlyHealthyToHealthyNodeHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/UnhealthyToHealthyNodeHandler.java similarity index 50% rename from hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/ReadOnlyHealthyToHealthyNodeHandler.java rename to hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/UnhealthyToHealthyNodeHandler.java index eba89247a196..56bda4498478 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/ReadOnlyHealthyToHealthyNodeHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/UnhealthyToHealthyNodeHandler.java @@ -17,35 +17,42 @@ package org.apache.hadoop.hdds.scm.node; +import java.util.Objects; import org.apache.hadoop.hdds.protocol.DatanodeDetails; -import org.apache.hadoop.hdds.scm.ha.SCMService.Event; +import org.apache.hadoop.hdds.scm.ha.SCMService; import org.apache.hadoop.hdds.scm.ha.SCMServiceManager; +import org.apache.hadoop.hdds.scm.net.NetworkTopology; import org.apache.hadoop.hdds.server.events.EventHandler; import org.apache.hadoop.hdds.server.events.EventPublisher; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * Handles Read Only healthy to healthy node event. (Possibly due to a - * datanode having finalized) + * Handles non healthy to healthy node event. */ -public class ReadOnlyHealthyToHealthyNodeHandler - implements EventHandler { - private static final Logger LOG = - LoggerFactory.getLogger(ReadOnlyHealthyToHealthyNodeHandler.class); +public class UnhealthyToHealthyNodeHandler implements EventHandler { + private static final Logger LOG = LoggerFactory.getLogger(UnhealthyToHealthyNodeHandler.class); private final SCMServiceManager serviceManager; + private final NodeManager nodeManager; - public ReadOnlyHealthyToHealthyNodeHandler(SCMServiceManager serviceManager) { + public UnhealthyToHealthyNodeHandler(NodeManager nodeManager, SCMServiceManager serviceManager) { this.serviceManager = serviceManager; + this.nodeManager = nodeManager; } @Override - public void onMessage(DatanodeDetails datanodeDetails, - EventPublisher publisher) { - LOG.info("Datanode {} moved to HEALTHY state.", - datanodeDetails); - serviceManager.notifyEventTriggered( - Event.UNHEALTHY_TO_HEALTHY_NODE_HANDLER_TRIGGERED); + public void onMessage(DatanodeDetails datanodeDetails, EventPublisher publisher) { + LOG.info("Datanode {} moved to HEALTHY state.", datanodeDetails); + + //add node back if it is not present in networkTopology + NetworkTopology nt = nodeManager.getClusterNetworkTopologyMap(); + if (!nt.contains(datanodeDetails)) { + nt.add(datanodeDetails); + // make sure after DN is added back into topology, DatanodeDetails + // instance returned from nodeStateManager has parent correctly set. + Objects.requireNonNull(nodeManager.getNode(datanodeDetails.getID()).getParent(), "Parent == null"); + } + serviceManager.notifyEventTriggered(SCMService.Event.UNHEALTHY_TO_HEALTHY_NODE_HANDLER_TRIGGERED); } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index 373f39c78e49..3c4e0c7b1718 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -123,16 +123,15 @@ import org.apache.hadoop.hdds.scm.net.NetworkTopology; import org.apache.hadoop.hdds.scm.net.NetworkTopologyImpl; import org.apache.hadoop.hdds.scm.node.DeadNodeHandler; -import org.apache.hadoop.hdds.scm.node.HealthyReadOnlyNodeHandler; import org.apache.hadoop.hdds.scm.node.NewNodeHandler; import org.apache.hadoop.hdds.scm.node.NodeAddressUpdateHandler; import org.apache.hadoop.hdds.scm.node.NodeDecommissionManager; import org.apache.hadoop.hdds.scm.node.NodeManager; import org.apache.hadoop.hdds.scm.node.NodeReportHandler; -import org.apache.hadoop.hdds.scm.node.ReadOnlyHealthyToHealthyNodeHandler; import org.apache.hadoop.hdds.scm.node.SCMNodeManager; import org.apache.hadoop.hdds.scm.node.StaleNodeHandler; import org.apache.hadoop.hdds.scm.node.StartDatanodeAdminHandler; +import org.apache.hadoop.hdds.scm.node.UnhealthyToHealthyNodeHandler; import org.apache.hadoop.hdds.scm.pipeline.PipelineActionHandler; import org.apache.hadoop.hdds.scm.pipeline.PipelineManager; import org.apache.hadoop.hdds.scm.pipeline.PipelineManagerImpl; @@ -501,12 +500,8 @@ private void initializeEventHandlers() { pipelineManager, containerManager, null); StartDatanodeAdminHandler datanodeStartAdminHandler = new StartDatanodeAdminHandler(scmNodeManager, pipelineManager); - ReadOnlyHealthyToHealthyNodeHandler readOnlyHealthyToHealthyNodeHandler = - new ReadOnlyHealthyToHealthyNodeHandler(serviceManager); - HealthyReadOnlyNodeHandler - healthyReadOnlyNodeHandler = - new HealthyReadOnlyNodeHandler(scmNodeManager, - pipelineManager); + UnhealthyToHealthyNodeHandler unhealthyToHealthyNodeHandler = + new UnhealthyToHealthyNodeHandler(scmNodeManager, serviceManager); ContainerActionsHandler actionsHandler = new ContainerActionsHandler(); ContainerReportHandler containerReportHandler = @@ -586,10 +581,7 @@ private void initializeEventHandlers() { eventQueue.addHandler(SCMEvents.NODE_ADDRESS_UPDATE, nodeAddressUpdateHandler); eventQueue.addHandler(SCMEvents.STALE_NODE, staleNodeHandler); - eventQueue.addHandler(SCMEvents.HEALTHY_READONLY_TO_HEALTHY_NODE, - readOnlyHealthyToHealthyNodeHandler); - eventQueue.addHandler(SCMEvents.HEALTHY_READONLY_NODE, - healthyReadOnlyNodeHandler); + eventQueue.addHandler(SCMEvents.UNHEALTHY_TO_HEALTHY_NODE, unhealthyToHealthyNodeHandler); eventQueue.addHandler(SCMEvents.DEAD_NODE, deadNodeHandler); eventQueue.addHandler(SCMEvents.START_ADMIN_ON_NODE, datanodeStartAdminHandler); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/placement/algorithms/TestSCMContainerPlacementRackAware.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/placement/algorithms/TestSCMContainerPlacementRackAware.java index 94cdc33e869a..de3d2b8778cf 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/placement/algorithms/TestSCMContainerPlacementRackAware.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/placement/algorithms/TestSCMContainerPlacementRackAware.java @@ -877,11 +877,9 @@ public void chooseNodeWithUsedAndFavouredNodesMultipleRack() @Test public void testSourceDatanodeIsNotChosenAsTarget() { - setup(2); + setup(1); List usedNodes = new ArrayList<>(); usedNodes.add(datanodes.get(0)); - dnInfos.get(1).setNodeStatus(NodeStatus.inServiceHealthyReadOnly()); - assertThrows(SCMException.class, () -> policy.chooseDatanodes(usedNodes, null, null, 1, 0, 0), "No target datanode, this call should fail"); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java index 138a848dfa38..7bdfefa067ac 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java @@ -83,7 +83,7 @@ public class TestDeadNodeHandler { private ContainerManager containerManager; private PipelineManagerImpl pipelineManager; private DeadNodeHandler deadNodeHandler; - private HealthyReadOnlyNodeHandler healthyReadOnlyNodeHandler; + private UnhealthyToHealthyNodeHandler unhealthyToHealthyNodeHandler; private EventPublisher publisher; @TempDir private File storageDir; @@ -118,9 +118,7 @@ public void setup() throws IOException, AuthenticationException { deletedBlockLog = mock(DeletedBlockLog.class); deadNodeHandler = new DeadNodeHandler(nodeManager, mock(PipelineManager.class), containerManager, deletedBlockLog); - healthyReadOnlyNodeHandler = - new HealthyReadOnlyNodeHandler(nodeManager, - pipelineManager); + unhealthyToHealthyNodeHandler = new UnhealthyToHealthyNodeHandler(nodeManager, scm.getSCMServiceManager()); eventQueue.addHandler(SCMEvents.DEAD_NODE, deadNodeHandler); publisher = mock(EventPublisher.class); } @@ -284,7 +282,7 @@ public void testOnMessage(@TempDir File tempDir) throws Exception { assertEquals(datanode3, container3Replicas.iterator().next().getDatanodeDetails()); //datanode will be added back to ClusterNetworkTopology if it resurrects - healthyReadOnlyNodeHandler.onMessage(datanode1, publisher); + unhealthyToHealthyNodeHandler.onMessage(datanode1, publisher); assertTrue( nodeManager.getClusterNetworkTopologyMap().contains(datanode1)); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeStateManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeStateManager.java index 0f536b4b01cc..65f00730230b 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeStateManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeStateManager.java @@ -17,13 +17,10 @@ package org.apache.hadoop.hdds.scm.node; -import static org.apache.hadoop.hdds.upgrade.HDDSLayoutVersionManager.maxLayoutVersion; import static org.apache.hadoop.ozone.container.upgrade.UpgradeUtils.defaultLayoutVersionProto; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; import java.io.IOException; import java.util.ArrayList; @@ -44,15 +41,11 @@ import org.apache.hadoop.hdds.scm.server.upgrade.FinalizationCheckpoint; import org.apache.hadoop.hdds.server.events.Event; import org.apache.hadoop.hdds.server.events.EventPublisher; -import org.apache.hadoop.hdds.upgrade.HDDSLayoutVersionManager; import org.apache.hadoop.hdds.utils.HddsServerUtil; import org.apache.hadoop.ozone.container.upgrade.UpgradeUtils; -import org.apache.hadoop.ozone.upgrade.LayoutVersionManager; import org.apache.hadoop.util.Time; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Class to test the NodeStateManager, which is an internal class used by @@ -60,15 +53,9 @@ */ public class TestNodeStateManager { - private static final Logger LOG = - LoggerFactory.getLogger(TestNodeStateManager.class); - private NodeStateManager nsm; private ConfigurationSource conf; private MockEventPublisher eventPublisher; - private SCMContext scmContext; - private int scmSlv; - private int scmMlv; @BeforeEach public void setUp() { @@ -90,17 +77,11 @@ public char[] getPassword(String key) throws IOException { }; // Make NodeStateManager behave as if SCM has completed finalization, // unless a test changes the value of this variable. - scmContext = SCMContext.emptyContext(); + SCMContext scmContext = SCMContext.emptyContext(); scmContext.setFinalizationCheckpoint( FinalizationCheckpoint.FINALIZATION_COMPLETE); eventPublisher = new MockEventPublisher(); - scmSlv = maxLayoutVersion(); - scmMlv = maxLayoutVersion(); - LayoutVersionManager mockVersionManager = mock(HDDSLayoutVersionManager.class); - when(mockVersionManager.getMetadataLayoutVersion()).thenReturn(scmMlv); - when(mockVersionManager.getSoftwareLayoutVersion()).thenReturn(scmSlv); - nsm = new NodeStateManager(conf, eventPublisher, mockVersionManager, - scmContext); + nsm = new NodeStateManager(conf, eventPublisher, scmContext); } @Test @@ -209,11 +190,11 @@ public void testNodeCanTransitionThroughHealthStatesAndFiresEvents() assertEquals(NodeState.DEAD, nsm.getNodeStatus(dn).getHealth()); assertEquals(SCMEvents.DEAD_NODE, eventPublisher.getLastEvent()); - // Transition to healthy readonly from dead + // Transition to healthy from dead dni.updateLastHeartbeatTime(); nsm.checkNodesHealth(); - assertEquals(NodeState.HEALTHY_READONLY, nsm.getNodeStatus(dn).getHealth()); - assertEquals(SCMEvents.HEALTHY_READONLY_NODE, eventPublisher.getLastEvent()); + assertEquals(NodeState.HEALTHY, nsm.getNodeStatus(dn).getHealth()); + assertEquals(SCMEvents.UNHEALTHY_TO_HEALTHY_NODE, eventPublisher.getLastEvent()); // Make the node stale again, and transition to healthy. dni.updateLastHeartbeatTime(now - staleLimit); @@ -222,36 +203,9 @@ public void testNodeCanTransitionThroughHealthStatesAndFiresEvents() assertEquals(SCMEvents.STALE_NODE, eventPublisher.getLastEvent()); dni.updateLastHeartbeatTime(); nsm.checkNodesHealth(); - assertEquals(NodeState.HEALTHY_READONLY, nsm.getNodeStatus(dn).getHealth()); - assertEquals(SCMEvents.HEALTHY_READONLY_NODE, eventPublisher.getLastEvent()); - - // Another health check run should move the node to healthy since its - // metadata layout version matches SCM's. - nsm.checkNodesHealth(); assertEquals(NodeState.HEALTHY, nsm.getNodeStatus(dn).getHealth()); - assertEquals(SCMEvents.HEALTHY_READONLY_TO_HEALTHY_NODE, eventPublisher.getLastEvent()); + assertEquals(SCMEvents.UNHEALTHY_TO_HEALTHY_NODE, eventPublisher.getLastEvent()); eventPublisher.clearEvents(); - - // Test how node state manager handles datanodes with lower metadata - // layout version based on SCM's finalization checkpoint. - dni.updateLastKnownLayoutVersion( - UpgradeUtils.toLayoutVersionProto(scmMlv - 1, scmSlv)); - for (FinalizationCheckpoint checkpoint: FinalizationCheckpoint.values()) { - scmContext.setFinalizationCheckpoint(checkpoint); - LOG.info("Testing datanode state from current SCM finalization " + - "checkpoint: {}", checkpoint); - nsm.checkNodesHealth(); - - // Datanodes should not be moved to healthy readonly until the SCM has - // finished updating its metadata layout version as part of finalization. - if (checkpoint.hasCrossed(FinalizationCheckpoint.MLV_EQUALS_SLV)) { - assertEquals(NodeState.HEALTHY_READONLY, nsm.getNodeStatus(dn).getHealth()); - assertEquals(SCMEvents.HEALTHY_READONLY_NODE, eventPublisher.getLastEvent()); - } else { - assertEquals(NodeState.HEALTHY, nsm.getNodeStatus(dn).getHealth()); - assertNull(eventPublisher.getLastEvent()); - } - } } @Test @@ -296,14 +250,14 @@ public void testHealthEventsFiredWhenOpStateChanged() nsm.addNode(dn, UpgradeUtils.defaultLayoutVersionProto()); // First set the node to decommissioned, then run through all op states in - // order and ensure the healthy_to_healthy_readonly event gets fired + // order and ensure the unhealthy_to_healthy event gets fired nsm.setNodeOperationalState(dn, HddsProtos.NodeOperationalState.DECOMMISSIONED); for (HddsProtos.NodeOperationalState s : HddsProtos.NodeOperationalState.values()) { eventPublisher.clearEvents(); nsm.setNodeOperationalState(dn, s); - assertEquals(SCMEvents.HEALTHY_READONLY_TO_HEALTHY_NODE, eventPublisher.getLastEvent()); + assertEquals(SCMEvents.UNHEALTHY_TO_HEALTHY_NODE, eventPublisher.getLastEvent()); } // Now make the node stale and run through all states again ensuring the diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeManager.java index 59a4938e8d76..0e6a1ff41426 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeManager.java @@ -57,7 +57,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; -import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Set; @@ -70,7 +69,6 @@ import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.hdds.HddsConfigKeys; import org.apache.hadoop.hdds.client.RatisReplicationConfig; -import org.apache.hadoop.hdds.client.ReplicationConfig; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.fs.SpaceUsageSource; import org.apache.hadoop.hdds.protocol.DatanodeDetails; @@ -88,7 +86,6 @@ import org.apache.hadoop.hdds.scm.ScmConfigKeys; import org.apache.hadoop.hdds.scm.container.placement.metrics.SCMNodeStat; import org.apache.hadoop.hdds.scm.events.SCMEvents; -import org.apache.hadoop.hdds.scm.exceptions.SCMException; import org.apache.hadoop.hdds.scm.ha.SCMContext; import org.apache.hadoop.hdds.scm.net.NetworkTopology; import org.apache.hadoop.hdds.scm.net.NetworkTopologyImpl; @@ -268,40 +265,6 @@ public void testGetLastHeartbeatTimeDiff() throws Exception { } } - /** - * Tests that node manager handles layout version changes from heartbeats - * correctly. - * - * @throws IOException - * @throws InterruptedException - * @throws TimeoutException - */ - @Test - public void testScmLayoutOnHeartbeat() throws Exception { - OzoneConfiguration conf = getConf(); - conf.setTimeDuration(ScmConfigKeys.OZONE_SCM_PIPELINE_CREATION_INTERVAL, - 1, TimeUnit.DAYS); - - try (SCMNodeManager nodeManager = createNodeManager(conf)) { - assertTrue(scm.getScmContext().isLeader()); - // Register 2 nodes correctly. - // These will be used with a faulty node to test pipeline creation. - DatanodeDetails goodNode1 = registerWithCapacity(nodeManager); - DatanodeDetails goodNode2 = registerWithCapacity(nodeManager); - - scm.exitSafeMode(); - - assertPipelineClosedAfterLayoutHeartbeat(goodNode1, goodNode2, - nodeManager, SMALLER_MLV_LAYOUT_PROTO); - assertPipelineClosedAfterLayoutHeartbeat(goodNode1, goodNode2, - nodeManager, LARGER_MLV_SLV_LAYOUT_PROTO); - assertPipelineClosedAfterLayoutHeartbeat(goodNode1, goodNode2, - nodeManager, SMALLER_MLV_SLV_LAYOUT_PROTO); - assertPipelineClosedAfterLayoutHeartbeat(goodNode1, goodNode2, - nodeManager, LARGER_SLV_LAYOUT_PROTO); - } - } - /** * Create {@link DatanodeDetails} to register with {@code nodeManager}, and * provide the datanode maximum capacity so that space used does not block @@ -341,50 +304,6 @@ private DatanodeDetails registerWithCapacity(SCMNodeManager nodeManager, return cmd.getDatanode(); } - private void assertPipelineClosedAfterLayoutHeartbeat( - DatanodeDetails originalNode1, DatanodeDetails originalNode2, - SCMNodeManager nodeManager, LayoutVersionProto layout) throws Exception { - - List originalNodes = - Arrays.asList(originalNode1, originalNode2); - - // Initial condition: 2 healthy nodes registered. - assertPipelines(HddsProtos.ReplicationFactor.ONE, count -> count == 2, - originalNodes); - assertPipelines(HddsProtos.ReplicationFactor.THREE, - count -> count == 0, new ArrayList<>()); - - // Even when safemode exit or new node addition trigger pipeline - // creation, they will fail with not enough healthy nodes for ratis 3 - // pipeline. Therefore we do not have to worry about this create call - // failing due to datanodes reaching their maximum pipeline limit. - assertPipelineCreationFailsWithNotEnoughNodes(2); - - // Register a new node correctly. - DatanodeDetails node = registerWithCapacity(nodeManager); - - List allNodes = new ArrayList<>(originalNodes); - allNodes.add(node); - - // Safemode exit and adding the new node should trigger pipeline creation. - assertPipelines(HddsProtos.ReplicationFactor.ONE, count -> count == 3, - allNodes); - assertPipelines(HddsProtos.ReplicationFactor.THREE, count -> count >= 1, - allNodes); - - // node sends incorrect layout. - nodeManager.processLayoutVersionReport(node, layout); - - // Its pipelines should be closed then removed, meaning there is not - // enough nodes for factor 3 pipelines. - assertPipelines(HddsProtos.ReplicationFactor.ONE, count -> count == 2, - originalNodes); - assertPipelines(HddsProtos.ReplicationFactor.THREE, - count -> count == 0, new ArrayList<>()); - - assertPipelineCreationFailsWithNotEnoughNodes(2); - } - /** * Tests that node manager handles layout versions for newly registered nodes * correctly. @@ -410,14 +329,12 @@ public void testScmLayoutOnRegister() SMALLER_MLV_SLV_LAYOUT_PROTO, errorNodeNotPermitted); registerWithCapacity(nodeManager, LARGER_MLV_SLV_LAYOUT_PROTO, errorNodeNotPermitted); - // Nodes with mismatched MLV can join, but should not be allowed in - // pipelines. + // Nodes with mismatched MLV can join DatanodeDetails badMlvNode1 = registerWithCapacity(nodeManager, SMALLER_MLV_LAYOUT_PROTO, success); DatanodeDetails badMlvNode2 = registerWithCapacity(nodeManager, SMALLER_MLV_LAYOUT_PROTO, success); - // This node has correct MLV and SLV, so it can join and be used in - // pipelines. + // This node has correct MLV and SLV DatanodeDetails goodNode = registerWithCapacity(nodeManager, CORRECT_LAYOUT_PROTO, success); @@ -425,29 +342,6 @@ public void testScmLayoutOnRegister() scm.exitSafeMode(); - // SCM should auto create a factor 1 pipeline for the one healthy node. - // Still should not have enough healthy nodes for ratis 3 pipeline. - assertPipelines(HddsProtos.ReplicationFactor.ONE, - count -> count == 1, - Collections.singletonList(goodNode)); - assertPipelines(HddsProtos.ReplicationFactor.THREE, - count -> count == 0, - new ArrayList<>()); - - // Even when safemode exit or new node addition trigger pipeline - // creation, they will fail with not enough healthy nodes for ratis 3 - // pipeline. Therefore we do not have to worry about this create call - // failing due to datanodes reaching their maximum pipeline limit. - assertPipelineCreationFailsWithExceedingLimit(2); - - // Heartbeat bad MLV nodes back to healthy. - nodeManager.processLayoutVersionReport(badMlvNode1, CORRECT_LAYOUT_PROTO); - nodeManager.processLayoutVersionReport(badMlvNode2, CORRECT_LAYOUT_PROTO); - nodeManager.processHeartbeat(badMlvNode1); - nodeManager.processHeartbeat(badMlvNode2); - - // After moving out of healthy readonly, pipeline creation should be - // triggered. assertPipelines(HddsProtos.ReplicationFactor.ONE, count -> count == 3, Arrays.asList(badMlvNode1, badMlvNode2, goodNode)); @@ -457,32 +351,6 @@ public void testScmLayoutOnRegister() } } - private void assertPipelineCreationFailsWithNotEnoughNodes( - int actualNodeCount) throws Exception { - SCMException ex = assertThrows(SCMException.class, () -> { - ReplicationConfig ratisThree = - ReplicationConfig.fromProtoTypeAndFactor( - HddsProtos.ReplicationType.RATIS, - HddsProtos.ReplicationFactor.THREE); - scm.getPipelineManager().createPipeline(ratisThree); - }, "3 nodes should not have been found for a pipeline."); - assertThat(ex.getMessage()).contains("Required 3. Found " + - actualNodeCount); - } - - private void assertPipelineCreationFailsWithExceedingLimit(int limit) { - // Build once, outside the assertion - ReplicationConfig config = ReplicationConfig.fromProtoTypeAndFactor( - HddsProtos.ReplicationType.RATIS, - HddsProtos.ReplicationFactor.THREE); - SCMException ex = assertThrows( - SCMException.class, - () -> scm.getPipelineManager().createPipeline(config), - "3 nodes should not have been found for a pipeline."); - assertThat(ex.getMessage()) - .contains("Cannot create pipeline as it would exceed the limit per datanode: " + limit); - } - private void assertPipelines(HddsProtos.ReplicationFactor factor, Predicate countCheck, Collection allowedDNs) throws Exception { diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeMetrics.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeMetrics.java index ac2c1e4c51eb..5a5a9c594a73 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeMetrics.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestSCMNodeMetrics.java @@ -174,8 +174,6 @@ public void testNodeCountAndInfoMetricsReported() throws Exception { assertGauge("InServiceHealthyNodes", 1, getMetrics(SCMNodeMetrics.class.getSimpleName())); - assertGauge("InServiceHealthyReadonlyNodes", 0, - getMetrics(SCMNodeMetrics.class.getSimpleName())); assertGauge("InServiceStaleNodes", 0, getMetrics(SCMNodeMetrics.class.getSimpleName())); assertGauge("InServiceDeadNodes", 0, @@ -246,7 +244,6 @@ public void testNodeCountAndInfoMetricsReported() throws Exception { nodeManager.processHeartbeat(registeredDatanode); sleep(4000); metricsSource = getMetrics(SCMNodeMetrics.SOURCE_NAME); - assertGauge("InServiceHealthyReadonlyNodes", 0, metricsSource); assertGauge("InServiceHealthyNodes", 1, metricsSource); } diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestCreateForReadComparator.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestCreateForReadComparator.java index f7b561a45b1e..43a88580e4c4 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestCreateForReadComparator.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestCreateForReadComparator.java @@ -20,19 +20,14 @@ import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.DECOMMISSIONING; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.ENTERING_MAINTENANCE; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY; -import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY_READONLY; import static org.apache.hadoop.hdds.scm.node.NodeStatus.inServiceDead; import static org.apache.hadoop.hdds.scm.node.NodeStatus.inServiceHealthy; import static org.apache.hadoop.hdds.scm.node.NodeStatus.inServiceStale; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.Comparator; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.scm.node.NodeStatus; import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; /** * Tests for {@link ECPipelineProvider#CREATE_FOR_READ_COMPARATOR}. @@ -44,14 +39,6 @@ int compare(NodeStatus left, NodeStatus right) { return comparator.compare(left, right); } - @ParameterizedTest - @EnumSource - void readOnly(HddsProtos.NodeOperationalState state) { - assertEquals(0, compare( - NodeStatus.valueOf(state, HEALTHY), - NodeStatus.valueOf(state, HEALTHY_READONLY))); - } - @Test void healthyFirst() { assertThat(0).isGreaterThan(compare(inServiceHealthy(), inServiceStale())); diff --git a/hadoop-ozone/cli-admin/src/test/java/org/apache/hadoop/hdds/scm/cli/datanode/TestListInfoSubcommand.java b/hadoop-ozone/cli-admin/src/test/java/org/apache/hadoop/hdds/scm/cli/datanode/TestListInfoSubcommand.java index 13ae6a35f10d..d934bbefaeb0 100644 --- a/hadoop-ozone/cli-admin/src/test/java/org/apache/hadoop/hdds/scm/cli/datanode/TestListInfoSubcommand.java +++ b/hadoop-ozone/cli-admin/src/test/java/org/apache/hadoop/hdds/scm/cli/datanode/TestListInfoSubcommand.java @@ -100,14 +100,16 @@ public void testDataNodeOperationalStateAndHealthIncludedInOutput() m = p.matcher(outContent.toString(DEFAULT_ENCODING)); assertTrue(m.find()); for (HddsProtos.NodeState state : HddsProtos.NodeState.values()) { + if (state == HddsProtos.NodeState.HEALTHY_READONLY) { + continue; // HEALTHY_READONLY is no longer a valid state and the protobuf definition is deprecated. + } p = Pattern.compile( "^Health State:\\s+" + state + "$", Pattern.MULTILINE); m = p.matcher(outContent.toString(DEFAULT_ENCODING)); assertTrue(m.find()); } - // Ensure the nodes are ordered by health state HEALTHY, - // HEALTHY_READONLY, STALE, DEAD - p = Pattern.compile(".+HEALTHY.+STALE.+DEAD.+HEALTHY_READONLY.+", + // Ensure the nodes are ordered by health state HEALTHY, STALE, DEAD + p = Pattern.compile(".+HEALTHY.+STALE.+DEAD.+", Pattern.DOTALL); m = p.matcher(outContent.toString(DEFAULT_ENCODING)); @@ -148,12 +150,15 @@ public void testDataNodeOperationalStateAndHealthIncludedInOutput() // Check all expected health states are present for (HddsProtos.NodeState state : HddsProtos.NodeState.values()) { + if (state == HddsProtos.NodeState.HEALTHY_READONLY) { + continue; // HEALTHY_READONLY is no longer a valid state and the protobuf definition is deprecated. + } assertTrue(healthStates.contains(state.toString()), "Expected health state: " + state + " but not found"); } - // Check order: HEALTHY -> STALE -> DEAD -> HEALTHY_READONLY - List expectedOrder = Arrays.asList("HEALTHY", "STALE", "DEAD", "HEALTHY_READONLY"); + // Check order: HEALTHY -> STALE -> DEAD + List expectedOrder = Arrays.asList("HEALTHY", "STALE", "DEAD"); int lastIndex = -1; for (String state : healthStates) { int index = expectedOrder.indexOf(state); @@ -420,10 +425,6 @@ private List getNodeDetails() { builder.addNodeOperationalStates( HddsProtos.NodeOperationalState.DECOMMISSIONING); builder.addNodeStates(HddsProtos.NodeState.DEAD); - } else if (i == 2) { - builder.addNodeOperationalStates( - HddsProtos.NodeOperationalState.IN_SERVICE); - builder.addNodeStates(HddsProtos.NodeState.HEALTHY_READONLY); } else { builder.addNodeOperationalStates( HddsProtos.NodeOperationalState.IN_SERVICE); diff --git a/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - Overall Metrics.json b/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - Overall Metrics.json index 825fc0a257dd..dd2c443409c8 100644 --- a/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - Overall Metrics.json +++ b/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - Overall Metrics.json @@ -2585,18 +2585,13 @@ "text": "healthy", "value": "healthy" }, - { - "selected": false, - "text": "healthy_readonly", - "value": "healthy_readonly" - }, { "selected": false, "text": "stale", "value": "stale" } ], - "query": "dead,healthy,healthy_readonly,stale", + "query": "dead,healthy,stale", "queryValue": "", "skipUrlSync": false, "type": "custom" diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/upgrade/TestHDDSUpgrade.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/upgrade/TestHDDSUpgrade.java index 4b612729583d..fe9eeee386c3 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/upgrade/TestHDDSUpgrade.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/upgrade/TestHDDSUpgrade.java @@ -23,7 +23,6 @@ import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_PIPELINE_REPORT_INTERVAL; import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.CLOSED; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY; -import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY_READONLY; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_DATANODE_PIPELINE_LIMIT; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_RATIS_PIPELINE_LIMIT; @@ -312,10 +311,7 @@ public void testFinalizationFromInitialVersionToLatestVersion() cluster.getStorageContainerManagersList(), NUM_CONTAINERS_CREATED, NUM_DATA_NODES); - // All datanodes on the SCM should have moved to HEALTHY-READONLY state. - TestHddsUpgradeUtils.testDataNodesStateOnSCM( - cluster.getStorageContainerManagersList(), NUM_DATA_NODES, - HEALTHY_READONLY, HEALTHY); + TestHddsUpgradeUtils.testDataNodesStateOnSCM(cluster.getStorageContainerManagersList(), NUM_DATA_NODES, HEALTHY); // Verify the SCM has driven all the DataNodes through Layout Upgrade. // In the happy path case, no containers should have been quasi closed as @@ -872,16 +868,14 @@ public void testFinalizationWithFailureInjectionHelper( // Due to timing constraint also allow a "HEALTHY" state. loadSCMState(); TestHddsUpgradeUtils.testDataNodesStateOnSCM( - cluster.getStorageContainerManagersList(), NUM_DATA_NODES, - HEALTHY_READONLY, HEALTHY); + cluster.getStorageContainerManagersList(), NUM_DATA_NODES, HEALTHY); // Need to wait for post finalization heartbeat from DNs. LambdaTestUtils.await(600000, 500, () -> { try { loadSCMState(); TestHddsUpgradeUtils.testDataNodesStateOnSCM( - cluster.getStorageContainerManagersList(), NUM_DATA_NODES, - HEALTHY, null); + cluster.getStorageContainerManagersList(), NUM_DATA_NODES, HEALTHY); sleep(100); } catch (Throwable ex) { LOG.info(ex.getMessage()); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/upgrade/TestHddsUpgradeUtils.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/upgrade/TestHddsUpgradeUtils.java index facbe3f44d12..12fde9280247 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/upgrade/TestHddsUpgradeUtils.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/upgrade/TestHddsUpgradeUtils.java @@ -17,12 +17,11 @@ package org.apache.hadoop.hdds.upgrade; -import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY; -import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY_READONLY; import static org.apache.hadoop.ozone.upgrade.UpgradeFinalization.Status.ALREADY_FINALIZED; import static org.apache.hadoop.ozone.upgrade.UpgradeFinalization.Status.FINALIZATION_DONE; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertSame; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; @@ -105,11 +104,6 @@ public static void testPostUpgradeConditionsSCM(StorageContainerManager scm, scmVersionManager.getMetadataLayoutVersion()); assertThat(scmVersionManager.getMetadataLayoutVersion()).isGreaterThanOrEqualTo(1); - // SCM will not return from finalization until all HEALTHY datanodes - // have completed their finalization (MLV == SLV). This ensures datanodes - // are ready to serve requests even though containers may remain OPEN. - testDataNodesStateOnSCM(scm, numDatanodes, HEALTHY, HEALTHY_READONLY); - int countContainers = scm.getContainerManager().getContainers().size(); assertThat(countContainers).isGreaterThanOrEqualTo(numContainers); } @@ -174,10 +168,8 @@ public static void testPostUpgradeConditionsDataNodes( } public static void testDataNodesStateOnSCM(List scms, - int expectedDatanodeCount, HddsProtos.NodeState state, - HddsProtos.NodeState alternateState) { - scms.forEach(scm -> testDataNodesStateOnSCM(scm, expectedDatanodeCount, - state, alternateState)); + int expectedDatanodeCount, HddsProtos.NodeState state) { + scms.forEach(scm -> testDataNodesStateOnSCM(scm, expectedDatanodeCount, state)); } /* @@ -188,15 +180,13 @@ public static void testDataNodesStateOnSCM(List scms, * setting "alternateState = null". */ public static void testDataNodesStateOnSCM(StorageContainerManager scm, - int expectedDatanodeCount, HddsProtos.NodeState state, - HddsProtos.NodeState alternateState) { + int expectedDatanodeCount, HddsProtos.NodeState state) { int countNodes = 0; for (DatanodeDetails dn : scm.getScmNodeManager().getAllNodes()) { try { HddsProtos.NodeState dnState = scm.getScmNodeManager().getNodeStatus(dn).getHealth(); - assertTrue((dnState == state) || - (alternateState != null && dnState == alternateState)); + assertSame(state, dnState); } catch (NodeNotFoundException e) { e.printStackTrace(); fail("Node not found"); diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/ClusterStateEndpoint.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/ClusterStateEndpoint.java index 05037e3166f9..0c969430eca5 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/ClusterStateEndpoint.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/ClusterStateEndpoint.java @@ -118,8 +118,7 @@ public Response getClusterState() { HddsProtos.LifeCycleState.DELETED)); int healthyDataNodes = - nodeManager.getNodeCount(NodeStatus.inServiceHealthy()) + - nodeManager.getNodeCount(NodeStatus.inServiceHealthyReadOnly()); + nodeManager.getNodeCount(NodeStatus.inServiceHealthy()); SCMNodeStat stats = nodeManager.getStats(); long fsCapacity = 0; diff --git a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/api/TestClusterStateEndpoint.java b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/api/TestClusterStateEndpoint.java index 6e187095c92a..f8806220399c 100644 --- a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/api/TestClusterStateEndpoint.java +++ b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/api/TestClusterStateEndpoint.java @@ -201,8 +201,6 @@ public void testStorageReportIsClusterStorageReport() { 1000L, 400L, 600L, 300L, 50L, 20L); when(mockNodeManager.getStats()).thenReturn(scmNodeStat); when(mockNodeManager.getNodeCount(NodeStatus.inServiceHealthy())).thenReturn(1); - when(mockNodeManager.getNodeCount(NodeStatus.inServiceHealthyReadOnly())) - .thenReturn(0); when(mockNodeManager.getAllNodeCount()).thenReturn(1); when(mockNodeManager.getAllNodes()).thenReturn(Collections.singletonList(mockDatanode)); when(mockNodeManager.getTotalFilesystemUsage(mockDatanode))