Add support for multiple assisted-service pods #2148

Status: Closed · wants to merge 1 commit
@@ -1,3 +1,4 @@
import datetime
import logging
import time
from enum import Enum
@@ -211,6 +212,20 @@ def wait_for_scale_config_change(
            waiting_for=f"replicas to change successfully to {replicas_count}",
        )

    @staticmethod
    def containers_ready_inside_pod(pod: kubernetes.dynamic.Resource) -> bool:
        logging.info(f"Checking if containers are ready inside pod {pod.metadata.name}")
        if not (hasattr(pod.status, "containerStatuses") and pod.status.containerStatuses):
            return False
        # All containers in the pod must report ready
        return all(
            hasattr(container, "ready") and bool(container.ready)
            for container in pod.status.containerStatuses
        )
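Side note: the readiness predicate can be sanity-checked without a cluster. A minimal sketch, using SimpleNamespace stand-ins for the dynamic-client objects; the class name DeploymentController is a placeholder for whatever class hosts this static method, not a name from this PR:

from types import SimpleNamespace

# Fake pod with one ready and one not-ready container (stand-ins only)
pod = SimpleNamespace(
    metadata=SimpleNamespace(name="assisted-service-7d9f"),
    status=SimpleNamespace(
        containerStatuses=[SimpleNamespace(ready=True), SimpleNamespace(ready=False)]
    ),
)
# One container is not ready, so the check is expected to return False
assert DeploymentController.containers_ready_inside_pod(pod) is False  # placeholder class name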

    def wait_for_pod_change(self, name: str, replicas_count: int, timeout=180, sleep=10, recover_time_pods=20) -> None:
        """Verify pods are created / deleted based on the replicas count
        When creating pods (replicas) we need to wait for a Running state
@@ -233,8 +248,10 @@ def wait_for_pod_change(self, name: str, replicas_count: int, timeout=180, sleep
        def pods_are_running():
            # Waiter predicate: verify all pods beginning with the same name are running
            pods = self._get_obj_resources_by_name(self.Resources.POD.value, name)
-           running = list(map(lambda pod: hasattr(pod.status, "phase") and pod.status.phase == "Running", pods))
-           return all(running)
+           running_pods = [self.containers_ready_inside_pod(pod) for pod in pods]
+           # During a restartedAt rollout pods may be created and terminated in
+           # parallel, so also verify the pod count matches the expected replicas
+           logging.info(f"{sum(running_pods)} of {len(pods)} pods running and ready, expecting {replicas_count}")
+           return len(pods) == replicas_count and all(running_pods)
Contributor comment on lines +251 to +254:
could be simpler to check if the full deployment has been rolled out successfully?

You can get inspiration from here:
https://github.com/fiaas/fiaas-deploy-daemon/pull/47/files#diff-2a85b9b3dfebaeb782d011387395182908f17ba26d491c029850f6ddcae02cecR66-R69

it does the same check as kubectl rollout status ...
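
For reference, a sketch of the suggested check, written against the same dynamic-client helpers this class already uses (self.resource_kind, self.Resources, self.namespace); it mirrors the conditions `kubectl rollout status deployment/<name>` evaluates:

    def deployment_rolled_out(self, name: str) -> bool:
        # Sketch only: the controller must have observed the latest spec, and all
        # replicas must be updated and available with no old-generation pods left
        deployment = self.resource_kind(self.Resources.DEPLOYMENT.value).get(
            name=name, namespace=self.namespace
        )
        spec_replicas = deployment.spec.replicas or 0
        status = deployment.status
        return (
            (status.observedGeneration or 0) >= deployment.metadata.generation
            and (status.updatedReplicas or 0) == spec_replicas
            and (status.availableReplicas or 0) == spec_replicas
            and (status.replicas or 0) == spec_replicas
        )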


        waiting.wait(
            lambda: pods_are_running() is True,
@@ -282,6 +299,27 @@ def change_configmap_data(
        if restart_deployment:
            self._restart_deployment(deployment_name)

    def _delete_leftovers_pods_replicas(self) -> None:
        """Delete all replicasets with zero replicas and pods that are not in ready status
        After the test we must clean all resources and restore the setup to the last good
        state without additional replicasets (a restartedAt run leaves leftovers behind)
        :return:
        """
        logging.info("Collecting leftover pods and replicasets for deletion")
        pods = self._get_obj_resources_by_name(self.Resources.POD.value, self.assisted_service_name)
        pods_not_running = [pod for pod in pods if not self.containers_ready_inside_pod(pod)]
        logging.info(f"Found {len(pods_not_running)} leftover pods")

        replicasets = self._get_obj_resources_by_name(self.Resources.REPLICASET.value, self.assisted_service_name)
        replicas_zero = [replica for replica in replicasets if replica.status.replicas == 0]
        logging.info(f"Found {len(replicas_zero)} leftover replicasets with zero replicas")
        for pod in pods_not_running:
            logging.info(f"Cleanup leftover pod {pod.metadata.name} not in Running state")
            self.resource_kind(self.Resources.POD.value).delete(pod.metadata.name, namespace=self.namespace)
        for replica in replicas_zero:
            logging.info(f"Cleanup leftover replicaset {replica.metadata.name} with zero replicas")
            self.resource_kind(self.Resources.REPLICASET.value).delete(replica.metadata.name, namespace=self.namespace)

    def rollout_assisted_service(
        self, configmap_before: dict[str, Any], configmap_after: dict[str, Any], request_node_name: str
    ) -> None:
@@ -293,9 +331,43 @@
        :param request_node_name: the rest_func name accepted by fixture
        :return:
        """
        self._delete_leftovers_pods_replicas()
        if configmap_before != configmap_after:
            logging.debug(f"configmap data {self.configmap_name} changed during {request_node_name} - restoring")
            self.patch_resource(self.Resources.CONFIGMAP.value, self.configmap_name, {"data": configmap_before})
            self._restart_deployment(self.assisted_service_name, request_node_name)
        else:
            logging.info(f"configmap {self.configmap_name} was not changed during the test {request_node_name}")

    def restart_deployment_now(self, total_pods: int, wait_for_pods: bool = True) -> None:
        """Restart the deployment when running multiple replicas (>1)
        The restartedAt annotation manages the replica and pod re-creation and leaves
        leftovers: after the restart an additional replicaset with 0 pods remains from
        the rollout. Pods are re-created safely, terminated/deleted one by one, which
        allows running an upgrade in parallel because running pods remain available
        during the restart
        :param total_pods: expected number of pods after the restart
        :param wait_for_pods: whether to wait until all pods are Running and ready
        :return:
        """
        date = datetime.datetime.now(datetime.timezone.utc).isoformat()
        body = {
            "spec": {
                "template": {
                    "metadata": {
                        "annotations": {
                            "kubectl.kubernetes.io/restartedAt": date,
                        }
                    }
                },
                "replicas": total_pods,
            }
        }

        # Restart the deployment without downtime (the same restartedAt mechanism
        # `kubectl rollout restart` uses) - kubernetes manages the rollout order
        self.resource_kind(self.Resources.DEPLOYMENT.value).patch(
            namespace=self.namespace, name=self.assisted_service_name, body=body
        )
        if wait_for_pods:
            logging.info(f"Waiting on restartedAt deployment, expecting {total_pods} pods to be in Running state")
            self.wait_for_pod_change(self.assisted_service_name, total_pods)
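
Taken together, a possible post-test flow using the new helpers; the object construction below is hypothetical, since the PR does not show how the class is instantiated:

# Hypothetical flow exercising the helpers added in this PR
deployment_ctrl = ...  # an instance of the class these methods belong to (not shown in the diff)

# Trigger a zero-downtime rolling restart scaled to 3 replicas and wait until
# all pods are Running and ready
deployment_ctrl.restart_deployment_now(total_pods=3, wait_for_pods=True)

# Afterwards, restore the setup: remove zero-replica replicasets and pods that
# never became ready
deployment_ctrl._delete_leftovers_pods_replicas()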