Skip to content

Commit e97cff3

Browse files
Management Server - Prepare for Maintenance and Cancel Maintenance improvements:
- Added new setting 'management.server.maintenance.ignore.maintenance.hosts' to ignore hosts in maintenance states while preparing the management server for maintenance. This skips the agent transfer and the agent count check for hosts in maintenance.
- Rebalance indirect agents after cancelling maintenance, using the 'rebalance' parameter of the cancelMaintenance API.
- Force maintenance after the maintenance window times out, using the 'forced' parameter of the prepareForMaintenance API.
- Propagate changes to the 'indirect.agent.lb.check.interval' setting to the host agents.
1 parent f496ed6 commit e97cff3

File tree

25 files changed

+308
-94
lines changed

25 files changed

+308
-94
lines changed

agent/src/main/java/com/cloud/agent/Agent.java

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -928,7 +928,7 @@ private Answer setupAgentCertificate(final SetupCertificateCommand cmd) {
928928
return new SetupCertificateAnswer(true);
929929
}
930930

931-
private void processManagementServerList(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval) {
931+
private void processManagementServerList(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval, final boolean triggerHostLB) {
932932
if (CollectionUtils.isNotEmpty(msList) && StringUtils.isNotEmpty(lbAlgorithm)) {
933933
try {
934934
final String newMSHosts = String.format("%s%s%s", com.cloud.utils.StringUtils.toCSVList(msList), IAgentShell.hostLbAlgorithmSeparator, lbAlgorithm);
@@ -941,6 +941,12 @@ private void processManagementServerList(final List<String> msList, final List<S
941941
}
942942
}
943943
shell.setAvoidHosts(avoidMsList);
944+
if (triggerHostLB) {
945+
logger.info("Triggering preferred host task");
946+
hostLbCheckExecutor = Executors.newSingleThreadScheduledExecutor((new NamedThreadFactory("HostLB-Executor")));
947+
ScheduledExecutorService hostLbExecutor = Executors.newScheduledThreadPool(1);
948+
hostLbExecutor.schedule(new PreferredHostCheckerTask(), 0, TimeUnit.MILLISECONDS);
949+
}
944950
if ("shuffle".equals(lbAlgorithm)) {
945951
scheduleHostLBCheckerTask(0);
946952
} else {
@@ -949,14 +955,14 @@ private void processManagementServerList(final List<String> msList, final List<S
949955
}
950956

951957
private Answer setupManagementServerList(final SetupMSListCommand cmd) {
952-
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
958+
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval(), cmd.getTriggerHostLb());
953959
return new SetupMSListAnswer(true);
954960
}
955961

956962
private Answer migrateAgentToOtherMS(final MigrateAgentConnectionCommand cmd) {
957963
try {
958964
if (CollectionUtils.isNotEmpty(cmd.getMsList())) {
959-
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
965+
processManagementServerList(cmd.getMsList(), cmd.getAvoidMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval(), false);
960966
}
961967
Executors.newSingleThreadScheduledExecutor(new NamedThreadFactory("MigrateAgentConnection-Job")).schedule(() -> {
962968
migrateAgentConnection(cmd.getAvoidMsList());
@@ -1046,7 +1052,7 @@ public void processReadyCommand(final Command cmd) {
10461052
}
10471053

10481054
verifyAgentArch(ready.getArch());
1049-
processManagementServerList(ready.getMsHostList(), ready.getAvoidMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval());
1055+
processManagementServerList(ready.getMsHostList(), ready.getAvoidMsHostList(), ready.getLbAlgorithm(), ready.getLbCheckInterval(), false);
10501056

10511057
logger.info("Ready command is processed for agent [id: {}, uuid: {}, name: {}]", getId(), getUuid(), getName());
10521058
}

api/src/main/java/com/cloud/exception/OperationTimedoutException.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ public class OperationTimedoutException extends CloudException {
4040
boolean _isActive;
4141

4242
public OperationTimedoutException(Command[] cmds, long agentId, long seqId, int time, boolean isActive) {
43-
super("Commands " + seqId + " to Host " + agentId + " timed out after " + time);
43+
super("Commands " + seqId + " to Host " + agentId + " timed out after " + time + " secs");
4444
_agentId = agentId;
4545
_seqId = seqId;
4646
_time = time;

api/src/main/java/com/cloud/resource/ResourceState.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ public static Event toEvent(String e) {
7676
}
7777
}
7878

79+
/**
 * Resource states that are treated as maintenance states.
 * {@code isMaintenanceState} checks membership in this list, so it is declared
 * {@code final}: the list built by {@code List.of} is already unmodifiable, and the
 * reference itself must not be reassignable by other classes either.
 */
public static final List<ResourceState> s_maintenanceStates = List.of(ResourceState.Maintenance,
        ResourceState.ErrorInMaintenance, ResourceState.PrepareForMaintenance,
        ResourceState.ErrorInPrepareForMaintenance);
82+
7983
public ResourceState getNextState(Event a) {
8084
return s_fsm.getNextState(this, a);
8185
}
@@ -98,8 +102,7 @@ public static String[] toString(ResourceState... states) {
98102
}
99103

100104
public static boolean isMaintenanceState(ResourceState state) {
101-
return Arrays.asList(ResourceState.Maintenance, ResourceState.ErrorInMaintenance,
102-
ResourceState.PrepareForMaintenance, ResourceState.ErrorInPrepareForMaintenance).contains(state);
105+
return s_maintenanceStates.contains(state);
103106
}
104107

105108
public static boolean canAttemptMaintenance(ResourceState state) {

api/src/main/java/org/apache/cloudstack/api/ApiConstants.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,7 @@ public class ApiConstants {
427427
public static final String PUBLIC_END_PORT = "publicendport";
428428
public static final String PUBLIC_ZONE = "publiczone";
429429
public static final String PURGE_RESOURCES = "purgeresources";
430+
public static final String REBALANCE = "rebalance";
430431
public static final String RECEIVED_BYTES = "receivedbytes";
431432
public static final String RECONNECT = "reconnect";
432433
public static final String RECOVER = "recover";

core/src/main/java/org/apache/cloudstack/agent/lb/SetupMSListCommand.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,15 @@ public class SetupMSListCommand extends Command {
2929
private List<String> avoidMsList;
3030
private String lbAlgorithm;
3131
private Long lbCheckInterval;
32+
private Boolean triggerHostLb;
3233

33-
public SetupMSListCommand(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval) {
34+
public SetupMSListCommand(final List<String> msList, final List<String> avoidMsList, final String lbAlgorithm, final Long lbCheckInterval, final Boolean triggerHostLb) {
3435
super();
3536
this.msList = msList;
3637
this.avoidMsList = avoidMsList;
3738
this.lbAlgorithm = lbAlgorithm;
3839
this.lbCheckInterval = lbCheckInterval;
40+
this.triggerHostLb = triggerHostLb;
3941
}
4042

4143
public List<String> getMsList() {
@@ -54,9 +56,12 @@ public Long getLbCheckInterval() {
5456
return lbCheckInterval;
5557
}
5658

59+
public boolean getTriggerHostLb() {
60+
return triggerHostLb;
61+
}
62+
5763
@Override
5864
public boolean executeInSequence() {
5965
return false;
6066
}
61-
6267
}

engine/components-api/src/main/java/com/cloud/agent/AgentManager.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,5 +171,5 @@ enum TapAgentsAction {
171171

172172
void propagateChangeToAgents(Map<String, String> params);
173173

174-
boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs);
174+
boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance);
175175
}

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2193,7 +2193,7 @@ public void propagateChangeToAgents(Map<String, String> params) {
21932193
}
21942194

21952195
@Override
2196-
public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs) {
2196+
public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance) {
21972197
return true;
21982198
}
21992199

engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
import javax.net.ssl.SSLContext;
4343
import javax.net.ssl.SSLEngine;
4444

45+
import com.cloud.resource.ResourceState;
4546
import org.apache.cloudstack.ca.CAManager;
4647
import org.apache.cloudstack.framework.config.ConfigDepot;
4748
import org.apache.cloudstack.framework.config.ConfigKey;
@@ -431,10 +432,10 @@ public boolean routeToPeer(final String peer, final byte[] bytes) {
431432
ch = connectToPeer(peer, ch);
432433
if (ch == null) {
433434
try {
434-
logD(bytes, "Unable to route to peer: " + Request.parse(bytes));
435+
logD(bytes, "Unable to establish connection to route to peer: " + Request.parse(bytes));
435436
} catch (ClassNotFoundException | UnsupportedVersionException e) {
436437
// Request.parse thrown exception when we try to log it, log as much as we can
437-
logD(bytes, "Unable to route to peer, and Request.parse further caught exception" + e.getMessage());
438+
logD(bytes, "Unable to establish connection to route to peer, and Request.parse further caught exception" + e.getMessage());
438439
}
439440
return false;
440441
}
@@ -643,7 +644,6 @@ protected void doTask(final Task task) throws TaskExecutionException {
643644
final Link link = task.getLink();
644645

645646
if (Request.fromServer(data)) {
646-
647647
final AgentAttache agent = findAttache(hostId);
648648

649649
if (Request.isControl(data)) {
@@ -691,7 +691,6 @@ protected void doTask(final Task task) throws TaskExecutionException {
691691
cancel(Long.toString(Request.getManagementServerId(data)), hostId, Request.getSequence(data), e.getMessage());
692692
}
693693
} else {
694-
695694
final long mgmtId = Request.getManagementServerId(data);
696695
if (mgmtId != -1 && mgmtId != _nodeId) {
697696
routeToPeer(Long.toString(mgmtId), data);
@@ -1352,7 +1351,7 @@ private String handleShutdownManagementServerHostCommand(BaseShutdownManagementS
13521351
if (cmd instanceof PrepareForMaintenanceManagementServerHostCommand) {
13531352
logger.debug("Received PrepareForMaintenanceManagementServerHostCommand - preparing for maintenance");
13541353
try {
1355-
managementServerMaintenanceManager.prepareForMaintenance(((PrepareForMaintenanceManagementServerHostCommand) cmd).getLbAlgorithm());
1354+
managementServerMaintenanceManager.prepareForMaintenance(((PrepareForMaintenanceManagementServerHostCommand) cmd).getLbAlgorithm(), ((PrepareForMaintenanceManagementServerHostCommand) cmd).isForced());
13561355
return "Successfully prepared for maintenance";
13571356
} catch(CloudRuntimeException e) {
13581357
return e.getMessage();
@@ -1399,14 +1398,14 @@ private String handleShutdownManagementServerHostCommand(BaseShutdownManagementS
13991398
}
14001399

14011400
@Override
1402-
public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs) {
1401+
public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long timeoutDurationInMs, boolean excludeHostsInMaintenance) {
14031402
if (timeoutDurationInMs <= 0) {
14041403
logger.debug("Not transferring direct agents from management server node {} (id: {}) to other nodes, invalid timeout duration", fromMsId, fromMsUuid);
14051404
return false;
14061405
}
14071406

14081407
long transferStartTimeInMs = System.currentTimeMillis();
1409-
if (CollectionUtils.isEmpty(getDirectAgentHosts(fromMsId))) {
1408+
if (CollectionUtils.isEmpty(getDirectAgentHosts(fromMsId, excludeHostsInMaintenance))) {
14101409
logger.info("No direct agent hosts available on management server node {} (id: {}), to transfer", fromMsId, fromMsUuid);
14111410
return true;
14121411
}
@@ -1421,7 +1420,7 @@ public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long
14211420
int agentTransferFailedCount = 0;
14221421
List<DataCenterVO> dataCenterList = dcDao.listAll();
14231422
for (DataCenterVO dc : dataCenterList) {
1424-
List<HostVO> directAgentHostsInDc = getDirectAgentHostsInDc(fromMsId, dc.getId());
1423+
List<HostVO> directAgentHostsInDc = getDirectAgentHostsInDc(fromMsId, dc.getId(), excludeHostsInMaintenance);
14251424
if (CollectionUtils.isEmpty(directAgentHostsInDc)) {
14261425
continue;
14271426
}
@@ -1455,9 +1454,9 @@ public boolean transferDirectAgentsFromMS(String fromMsUuid, long fromMsId, long
14551454
return (agentTransferFailedCount == 0);
14561455
}
14571456

1458-
private List<HostVO> getDirectAgentHosts(long msId) {
1457+
private List<HostVO> getDirectAgentHosts(long msId, boolean excludeHostsInMaintenance) {
14591458
List<HostVO> directAgentHosts = new ArrayList<>();
1460-
List<HostVO> hosts = _hostDao.listHostsByMs(msId);
1459+
// Honour excludeHostsInMaintenance instead of always passing null (which excludes nothing),
// using ResourceState as the source of truth exactly as getDirectAgentHostsInDc does.
List<ResourceState> statesToExclude = excludeHostsInMaintenance ? ResourceState.s_maintenanceStates : List.of();
List<HostVO> hosts = _hostDao.listHostsByMsResourceState(msId, statesToExclude);
14611460
for (HostVO host : hosts) {
14621461
AgentAttache agent = findAttache(host.getId());
14631462
if (agent instanceof DirectAgentAttache) {
@@ -1468,9 +1467,11 @@ private List<HostVO> getDirectAgentHosts(long msId) {
14681467
return directAgentHosts;
14691468
}
14701469

1471-
private List<HostVO> getDirectAgentHostsInDc(long msId, long dcId) {
1470+
private List<HostVO> getDirectAgentHostsInDc(long msId, long dcId, boolean excludeHostsInMaintenance) {
14721471
List<HostVO> directAgentHosts = new ArrayList<>();
1473-
List<HostVO> hosts = _hostDao.listHostsByMsAndDc(msId, dcId);
1472+
// To exclude maintenance states use values from ResourceState as source of truth
1473+
List<ResourceState> statesToExclude = excludeHostsInMaintenance ? ResourceState.s_maintenanceStates : List.of();
1474+
List<HostVO> hosts = _hostDao.listHostsByMsDcResourceState(msId, dcId, statesToExclude);
14741475
for (HostVO host : hosts) {
14751476
AgentAttache agent = findAttache(host.getId());
14761477
if (agent instanceof DirectAgentAttache) {
@@ -1506,6 +1507,10 @@ public void onManagementServerPreparingForMaintenance() {
15061507
public void onManagementServerCancelPreparingForMaintenance() {
15071508
// Callback for a cancelled "prepare for maintenance": logs the event, delegates to the
// superclass and clears the agent load-balancing flag.
logger.debug("Management server cancel preparing for maintenance");
15081509
// NOTE(review): this cancel handler delegates to super.onManagementServerPreparingForMaintenance(),
// not to a cancel counterpart. That looks like a copy-paste slip, but the superclass is not
// visible here - confirm which super method is intended before changing it.
super.onManagementServerPreparingForMaintenance();
1510+
1511+
// needed for the case when Management Server in Preparing For Maintenance but didn't go to Maintenance state
1512+
// (where this variable will be reset)
1513+
_agentLbHappened = false;
15091514
}
15101515

15111516
@Override

engine/schema/src/main/java/com/cloud/host/dao/HostDao.java

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -177,14 +177,24 @@ public interface HostDao extends GenericDao<HostVO, Long>, StateDao<Status, Stat
177177

178178
List<HostVO> listHostsByMsAndDc(long msId, long dcId);
179179

180+
List<HostVO> listHostsByMsDcResourceState(long msId, long dcId, List<ResourceState> excludedResourceStates);
181+
180182
List<HostVO> listHostsByMs(long msId);
181183

184+
List<HostVO> listHostsByMsResourceState(long msId, List<ResourceState> excludedResourceStates);
185+
182186
/**
183-
* Retrieves the number of hosts/agents this {@see ManagementServer} has responsibility over.
184-
* @param msId the id of the {@see ManagementServer}
185-
* @return the number of hosts/agents this {@see ManagementServer} has responsibility over
187+
* Count Hosts by given Management Server, Host and Hypervisor Types,
188+
* and exclude Hosts with given Resource States.
189+
*
190+
* @param msId Management Server Id
191+
* @param excludedResourceStates Resource States to be excluded
192+
* @param hostTypes Host Types
193+
* @param hypervisorTypes Hypervisor Types
194+
* @return Hosts count
186195
*/
187-
int countByMs(long msId);
196+
int countHostsByMsResourceStateTypeAndHypervisorType(long msId, List<ResourceState> excludedResourceStates,
197+
List<Type> hostTypes, List<HypervisorType> hypervisorTypes);
188198

189199
/**
190200
* Retrieves the host ids/agents this {@see ManagementServer} has responsibility over.

engine/schema/src/main/java/com/cloud/host/dao/HostDaoImpl.java

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
import com.cloud.utils.db.GenericSearchBuilder;
7373
import com.cloud.utils.db.JoinBuilder;
7474
import com.cloud.utils.db.JoinBuilder.JoinType;
75+
import com.cloud.utils.db.QueryBuilder;
7576
import com.cloud.utils.db.SearchBuilder;
7677
import com.cloud.utils.db.SearchCriteria;
7778
import com.cloud.utils.db.SearchCriteria.Func;
@@ -1600,6 +1601,17 @@ public List<HostVO> listHostsByMsAndDc(long msId, long dcId) {
16001601
return listBy(sc);
16011602
}
16021603

1604+
@Override
1605+
public List<HostVO> listHostsByMsDcResourceState(long msId, long dcId, List<ResourceState> excludedResourceStates) {
1606+
QueryBuilder<HostVO> sc = QueryBuilder.create(HostVO.class);
1607+
sc.and(sc.entity().getManagementServerId(), Op.EQ, msId);
1608+
sc.and(sc.entity().getDataCenterId(), Op.EQ, dcId);
1609+
if (CollectionUtils.isNotEmpty(excludedResourceStates)) {
1610+
sc.and(sc.entity().getResourceState(), Op.NIN, excludedResourceStates.toArray());
1611+
}
1612+
return listBy(sc.create());
1613+
}
1614+
16031615
@Override
16041616
public List<HostVO> listHostsByMs(long msId) {
16051617
SearchCriteria<HostVO> sc = ResponsibleMsSearch.create();
@@ -1608,10 +1620,32 @@ public List<HostVO> listHostsByMs(long msId) {
16081620
}
16091621

16101622
@Override
1611-
public int countByMs(long msId) {
1612-
SearchCriteria<HostVO> sc = ResponsibleMsSearch.create();
1613-
sc.setParameters("managementServerId", msId);
1614-
return getCount(sc);
1623+
public List<HostVO> listHostsByMsResourceState(long msId, List<ResourceState> excludedResourceStates) {
1624+
QueryBuilder<HostVO> sc = QueryBuilder.create(HostVO.class);
1625+
sc.and(sc.entity().getManagementServerId(), Op.EQ, msId);
1626+
if (CollectionUtils.isNotEmpty(excludedResourceStates)) {
1627+
sc.and(sc.entity().getResourceState(), Op.NIN, excludedResourceStates.toArray());
1628+
}
1629+
return listBy(sc.create());
1630+
}
1631+
1632+
@Override
1633+
public int countHostsByMsResourceStateTypeAndHypervisorType(long msId,
1634+
List<ResourceState> excludedResourceStates,
1635+
List<Type> hostTypes,
1636+
List<HypervisorType> hypervisorTypes) {
1637+
QueryBuilder<HostVO> sc = QueryBuilder.create(HostVO.class);
1638+
sc.and(sc.entity().getManagementServerId(), Op.EQ, msId);
1639+
if (CollectionUtils.isNotEmpty(excludedResourceStates)) {
1640+
sc.and(sc.entity().getResourceState(), Op.NIN, excludedResourceStates.toArray());
1641+
}
1642+
if (CollectionUtils.isNotEmpty(hostTypes)) {
1643+
sc.and(sc.entity().getType(), Op.IN, hostTypes.toArray());
1644+
}
1645+
if (CollectionUtils.isNotEmpty(hypervisorTypes)) {
1646+
sc.and(sc.entity().getHypervisorType(), Op.IN, hypervisorTypes.toArray());
1647+
}
1648+
return getCount(sc.create());
16151649
}
16161650

16171651
@Override

framework/agent-lb/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLB.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,11 @@ public interface IndirectAgentLB {
7070
*/
7171
Long getLBPreferredHostCheckInterval(Long clusterId);
7272

73-
void propagateMSListToAgents();
73+
void propagateMSListToAgents(boolean triggerHostLB);
7474

75-
boolean haveAgentBasedHosts(long msId);
75+
void propagateMSListToAgentsInCluster(Long clusterId);
7676

77-
boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorithm, long timeoutDurationInMs);
77+
boolean haveAgentBasedHosts(long msId, boolean excludeHostsInMaintenance);
78+
79+
boolean migrateAgents(String fromMsUuid, long fromMsId, String lbAlgorithm, long timeoutDurationInMs, boolean excludeHostsInMaintenance);
7880
}

plugins/maintenance/src/main/java/org/apache/cloudstack/api/command/CancelMaintenanceCmd.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,15 @@
1818
package org.apache.cloudstack.api.command;
1919

2020
import org.apache.cloudstack.api.APICommand;
21+
import org.apache.cloudstack.api.ApiConstants;
2122
import org.apache.cloudstack.api.BaseCmd;
2223

2324
import com.cloud.user.Account;
2425

26+
import org.apache.cloudstack.api.Parameter;
2527
import org.apache.cloudstack.api.response.ManagementServerMaintenanceResponse;
2628
import org.apache.cloudstack.acl.RoleType;
29+
import org.apache.commons.lang3.BooleanUtils;
2730

2831
@APICommand(name = CancelMaintenanceCmd.APINAME,
2932
description = "Cancels maintenance of the management server",
@@ -36,6 +39,13 @@ public class CancelMaintenanceCmd extends BaseMSMaintenanceActionCmd {
3639

3740
public static final String APINAME = "cancelMaintenance";
3841

42+
@Parameter(name = ApiConstants.REBALANCE, type = CommandType.BOOLEAN, description = "Rebalance agents (applicable for indirect agents) after cancelling maintenance, default is true")
43+
private Boolean rebalance;
44+
45+
public boolean getRebalance() {
46+
return BooleanUtils.toBooleanDefaultIfNull(rebalance, true);
47+
}
48+
3949
@Override
4050
public String getCommandName() {
4151
return APINAME.toLowerCase() + BaseCmd.RESPONSE_SUFFIX;

0 commit comments

Comments
 (0)