test(robot): add single replica node down cases
Signed-off-by: Chris <chris.chien@suse.com>
chriscchien committed May 9, 2024
1 parent 50c91e1 commit f181215
Showing 19 changed files with 356 additions and 1 deletion.
2 changes: 2 additions & 0 deletions e2e/keywords/common.resource
@@ -15,6 +15,7 @@ Library    ../libs/keywords/backupstore_keywords.py
Library ../libs/keywords/storageclass_keywords.py
Library ../libs/keywords/node_keywords.py
Library ../libs/keywords/backing_image_keywords.py
Library ../libs/keywords/host_keywords.py

*** Keywords ***
Set test environment
@@ -26,6 +27,7 @@ Set test environment
set_backupstore

Cleanup test resources
power_on_all_nodes
cleanup_control_plane_network_latency
cleanup_node_exec
cleanup_stress_helper
1 change: 1 addition & 0 deletions e2e/keywords/deployment.resource
@@ -27,3 +27,4 @@ Check deployment ${deployment_id} works
Wait for deployment ${deployment_id} pods stable
${deployment_name} = generate_name_with_suffix deployment ${deployment_id}
wait_for_workload_pods_stable ${deployment_name}

6 changes: 6 additions & 0 deletions e2e/keywords/host.resource
@@ -31,3 +31,9 @@ Power off all worker nodes for ${power_off_time_in_min} mins
Restart cluster
reboot_all_nodes
setup_control_plane_network_latency

Power on off nodes
power_on_all_nodes

Power on off node
power_on_node_by_name ${powered_off_node}
1 change: 1 addition & 0 deletions e2e/keywords/persistentvolumeclaim.resource
@@ -4,6 +4,7 @@ Documentation    PersistentVolumeClaim Keywords
Library Collections
Library ../libs/keywords/common_keywords.py
Library ../libs/keywords/persistentvolumeclaim_keywords.py
Library ../libs/keywords/volume_keywords.py

*** Keywords ***
Create persistentvolumeclaim ${claim_id} using ${volume_type} volume
1 change: 1 addition & 0 deletions e2e/keywords/statefulset.resource
@@ -4,6 +4,7 @@ Documentation    StatefulSet Keywords
Library Collections
Library ../libs/keywords/common_keywords.py
Library ../libs/keywords/statefulset_keywords.py
Library ../libs/keywords/volume_keywords.py

*** Keywords ***
Create statefulset ${statefulset_id} using ${volume_type} volume
33 changes: 33 additions & 0 deletions e2e/keywords/workload.resource
@@ -19,6 +19,15 @@ Power off volume node of ${workload_kind} ${workload_id} for ${duration} minutes
${volume_name} = get_workload_volume_name ${workload_name}
reboot_volume_node ${volume_name} ${duration}

Power off volume node of ${workload_kind} ${workload_id}
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}
${volume_name} = get_workload_volume_name ${workload_name}
${powered_off_node} = get_volume_node ${volume_name}
${last_volume_node} = get_volume_node ${volume_name}
power_off_volume_node ${volume_name}
Set Test Variable ${powered_off_node}
Set Test Variable ${last_volume_node}

Reboot volume node of ${workload_kind} ${workload_id}
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}
${volume_name} = get_workload_volume_name ${workload_name}
@@ -56,6 +65,16 @@ Wait for volume of ${workload_kind} ${workload_id} attached and unknown
${volume_name} = get_workload_volume_name ${workload_name}
wait_for_volume_unknown ${volume_name}

Wait for volume of ${workload_kind} ${workload_id} faulted
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}
${volume_name} = get_workload_volume_name ${workload_name}
wait_for_volume_faulted ${volume_name}

Wait for volume of ${workload_kind} ${workload_id} attaching
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}
${volume_name} = get_workload_volume_name ${workload_name}
wait_for_volume_attaching ${volume_name}

Wait for volume of ${workload_kind} ${workload_id} attached and degraded
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}
${volume_name} = get_workload_volume_name ${workload_name}
@@ -94,3 +113,17 @@ Wait for workloads pods stable
Append To List ${workload_list} ${workload_name}
END
wait_for_workloads_pods_stably_running ${workload_list}

Delete replica of ${workload_kind} ${workload_id} volume on all ${replica_locality}
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}
${volume_name} = get_workload_volume_name ${workload_name}
delete_replica_on_nodes ${volume_name} ${replica_locality}

Update volume of ${workload_kind} ${workload_id} replica count to ${replica_count}
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}
${volume_name} = get_workload_volume_name ${workload_name}
update_volume_spec ${volume_name} numberOfReplicas ${replica_count}

Wait for ${workload_kind} ${workload_id} pods on ${replica_locality} kept in state ${expect_state}
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}
wait_for_workload_pods_kept_in_state ${workload_name} ${replica_locality} ${expect_state} ${last_volume_node}
30 changes: 30 additions & 0 deletions e2e/libs/host/host.py
@@ -75,3 +75,33 @@ def reboot_all_worker_nodes(self, shut_down_time_in_sec=NODE_REBOOT_DOWN_TIME_SE
waiter = self.aws_client.get_waiter('instance_running')
waiter.wait(InstanceIds=instance_ids)
logging(f"Started instances")

def power_off_node(self, power_off_node_name):
instance_ids = [self.mapping[power_off_node_name]]
resp = self.aws_client.stop_instances(InstanceIds=instance_ids, Force=True)
assert resp['ResponseMetadata']['HTTPStatusCode'] == 200, f"Failed to stop instances {instance_ids} response: {resp}"
logging(f"Stopping instances {instance_ids}")
waiter = self.aws_client.get_waiter('instance_stopped')
waiter.wait(InstanceIds=instance_ids)
logging(f"Stopped instances")

def power_on_node(self, power_on_node_name):
instance_ids = [self.mapping[power_on_node_name]]

resp = self.aws_client.start_instances(InstanceIds=instance_ids)
logging(f"Starting instances {instance_ids} response: {resp}")
waiter = self.aws_client.get_waiter('instance_running')
waiter.wait(InstanceIds=instance_ids)
logging(f"Started instances")

def power_on_all_nodes(self):
instance_ids = [value for value in self.mapping.values()]

resp = self.aws_client.start_instances(InstanceIds=instance_ids)
logging(f"Starting instances {instance_ids} response: {resp}")
waiter = self.aws_client.get_waiter('instance_running')
waiter.wait(InstanceIds=instance_ids)

wait_for_cluster_ready()

logging(f"Started instances")
17 changes: 17 additions & 0 deletions e2e/libs/keywords/host_keywords.py
@@ -50,3 +50,20 @@ def reboot_node_by_name(self, node_name, downtime_in_min=1):

logging(f'Rebooting node {node_name} with downtime {reboot_down_time_sec} seconds')
self.host.reboot_node(node_name, reboot_down_time_sec)

def power_off_volume_node(self, volume_name):
node_id = self.volume_keywords.get_node_id_by_replica_locality(volume_name, "volume node")
logging(f'Power off volume {volume_name} node {node_id}')
self.host.power_off_node(node_id)

def power_on_volume_node(self, volume_name):
node_id = self.volume_keywords.get_node_id_by_replica_locality(volume_name, "volume node")

logging(f'Power on volume {volume_name} node {node_id}')
self.host.power_on_node(node_id)

def power_on_all_nodes(self):
self.host.power_on_all_nodes()

def power_on_node_by_name(self, node_name):
self.host.power_on_node(node_name)
19 changes: 19 additions & 0 deletions e2e/libs/keywords/volume_keywords.py
@@ -108,6 +108,14 @@ def delete_replica_on_node(self, volume_name, replica_locality):
logging(f"Deleting volume {volume_name}'s replica on node {node_name}")
self.volume.delete_replica(volume_name, node_name)

def delete_replica_on_nodes(self, volume_name, replica_locality):
check_replica_locality(replica_locality)

node_ids = self.get_node_ids_by_replica_locality(volume_name, replica_locality)
for node_id in node_ids:
logging(f"Deleting volume {volume_name}'s replica on node {node_id}")
self.volume.delete_replica(volume_name, node_id)

def set_annotation(self, volume_name, annotation_key, annotation_value):
self.volume.set_annotation(volume_name, annotation_key, annotation_value)

@@ -204,6 +212,14 @@ def wait_for_volume_healthy(self, volume_name):
logging(f'Waiting for volume {volume_name} to be healthy')
self.volume.wait_for_volume_healthy(volume_name)

def wait_for_volume_attaching(self, volume_name):
logging(f'Waiting for volume {volume_name} to be in attaching')
self.volume.wait_for_volume_attaching(volume_name)

def wait_for_volume_faulted(self, volume_name):
logging(f'Waiting for volume {volume_name} to be in faulted')
self.volume.wait_for_volume_faulted(volume_name)

def wait_for_volume_migration_ready(self, volume_name):
logging(f'Waiting for volume {volume_name} migration to be ready')
self.volume.wait_for_volume_migration_ready(volume_name)
@@ -220,3 +236,6 @@ def wait_for_volume_degraded(self, volume_name):

def wait_for_volume_unknown(self, volume_name):
self.volume.wait_for_volume_unknown(volume_name)

def update_volume_spec(self, volume_name, key, value):
self.volume.update_volume_spec(volume_name, key, value)
20 changes: 20 additions & 0 deletions e2e/libs/keywords/workload_keywords.py
@@ -12,10 +12,13 @@
from workload.workload import write_pod_random_data
from workload.workload import wait_for_workload_pods_running
from workload.workload import wait_for_workload_pods_stable
from workload.workload import wait_for_workload_pods_kept_in_state

from utility.constant import ANNOT_CHECKSUM
from utility.constant import ANNOT_EXPANDED_SIZE
from utility.utility import logging
from node.utility import check_replica_locality
from node.node import Node

from volume import Volume
from volume.constant import MEBIBYTE
@@ -121,3 +124,20 @@ def wait_for_workload_claim_size_expanded(self, workload_name, claim_index=0):
logging(f'Waiting for {workload_name} volume {volume_name} to expand to {expanded_size}')
self.volume.wait_for_volume_expand_to_size(volume_name, expanded_size)
self.volume.wait_for_volume_detached(volume_name)

def wait_for_workload_pods_kept_in_state(self, workload_name, replica_locality, expect_state, last_volume_node, namespace="default"):
check_replica_locality(replica_locality)
assert expect_state in ["Terminating", "ContainerCreating"], f"Unknown expected pod state: {expect_state}"

# A faulted volume cannot use "get_node_id_by_replica_locality", so resolve the volume node and the replica nodes from the recorded last volume node instead.
logging(f'Waiting for workloads {workload_name} pods on {replica_locality} kept in {expect_state}')
nodes_to_check = []
worker_nodes = Node.list_node_names_by_role("worker")
if replica_locality == "volume node":
nodes_to_check.append(last_volume_node)
elif replica_locality == "replica node":
for node in worker_nodes:
if node != last_volume_node:
nodes_to_check.append(node)

wait_for_workload_pods_kept_in_state(workload_name, nodes_to_check, expect_state, namespace=namespace)
4 changes: 4 additions & 0 deletions e2e/libs/persistentvolumeclaim/crd.py
@@ -71,3 +71,7 @@ def expand(self, claim_name, size, namespace="default"):
logging(f"Exception when expanding PVC: {e}")

return size

def get_volume_name(self, claim_name, claim_namespace):
claim = self.get(claim_name, claim_namespace)
return claim.spec.volume_name
3 changes: 3 additions & 0 deletions e2e/libs/persistentvolumeclaim/persistentvolumeclaim.py
@@ -95,3 +95,6 @@ def expand(self, claim_name, size_in_byte):
logging(f"Expanding PVC {claim_name} from {current_size} to {target_size}")
expanded_size = self.claim.expand(claim_name, target_size)
self.set_annotation(claim_name, ANNOT_EXPANDED_SIZE, str(expanded_size))

def get_volume_name(self, claim_name, claim_namespace):
return self.claim.get_volume_name(claim_name, claim_namespace)
26 changes: 26 additions & 0 deletions e2e/libs/volume/crd.py
@@ -374,3 +374,29 @@ def validate_volume_replicas_anti_affinity(self, volume_name):
for replica in replica_list:
node_set.add(replica['status']['ownerID'])
assert len(replica_list) == len(node_set), f"unexpected replicas on the same node: {replica_list}"

def update_volume_spec(self, volume_name, key, value):
# retry conflict error
for i in range(self.retry_count):
try:
volume = self.get(volume_name)
spec = volume['spec']
if key == "numberOfReplicas":
spec[key] = int(value)
else:
spec[key] = value
self.obj_api.replace_namespaced_custom_object(
group="longhorn.io",
version="v1beta2",
namespace="longhorn-system",
plural="volumes",
name=volume_name,
body=volume
)
break
except Exception as e:
if getattr(e, "status", None) == 409:
logging(f"Conflict error: {e.body}, retry ({i}) ...")
else:
raise e
time.sleep(self.retry_interval)
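The loop above follows the usual optimistic-concurrency pattern for Kubernetes custom resources: re-read the object and re-apply the change whenever the API server answers with HTTP 409. A minimal standalone sketch of that pattern (the function name, retry counts, and kubeconfig loading are illustrative, not part of this commit):

import time
from kubernetes import client, config
from kubernetes.client.rest import ApiException

def patch_volume_replica_count(volume_name, replica_count, retries=5, interval=2):
    config.load_kube_config()   # or load_incluster_config() when running inside the cluster
    api = client.CustomObjectsApi()
    for attempt in range(retries):
        try:
            volume = api.get_namespaced_custom_object(
                group="longhorn.io", version="v1beta2",
                namespace="longhorn-system", plural="volumes", name=volume_name)
            volume["spec"]["numberOfReplicas"] = int(replica_count)
            api.replace_namespaced_custom_object(
                group="longhorn.io", version="v1beta2",
                namespace="longhorn-system", plural="volumes",
                name=volume_name, body=volume)
            return
        except ApiException as e:
            if e.status != 409:   # only retry on update conflicts
                raise
            time.sleep(interval)
    raise RuntimeError(f"Failed to update {volume_name} after {retries} attempts")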
3 changes: 3 additions & 0 deletions e2e/libs/volume/rest.py
@@ -185,3 +185,6 @@ def check_data_checksum(self, volume_name, data_id):

def cleanup(self, volume_names):
return NotImplemented

def update_volume_spec(self, volume_name, key, value):
return NotImplemented
9 changes: 9 additions & 0 deletions e2e/libs/volume/volume.py
@@ -55,6 +55,13 @@ def wait_for_volume_attached(self, volume_name):
def wait_for_volume_detached(self, volume_name):
self.volume.wait_for_volume_state(volume_name, "detached")

def wait_for_volume_attaching(self, volume_name):
self.volume.wait_for_volume_state(volume_name, "attaching")

def wait_for_volume_faulted(self, volume_name):
self.volume.wait_for_volume_state(volume_name, "detached")
self.volume.wait_for_volume_robustness(volume_name, "faulted")

def wait_for_volume_healthy(self, volume_name):
self.volume.wait_for_volume_state(volume_name, "attached")
self.volume.wait_for_volume_robustness(volume_name, "healthy")
@@ -106,3 +113,5 @@ def check_data_checksum(self, volume_name, data_id):
def validate_volume_replicas_anti_affinity(self, volume_name):
return self.volume.validate_volume_replicas_anti_affinity(volume_name)

def update_volume_spec(self, volume_name, key, value):
return self.volume.update_volume_spec(volume_name, key, value)
1 change: 1 addition & 0 deletions e2e/libs/workload/constant.py
@@ -3,3 +3,4 @@
IMAGE_UBUNTU = 'ubuntu:16.04'

WAIT_FOR_POD_STABLE_MAX_RETRY = 120
WAIT_FOR_POD_KEPT_IN_STATE_TIME = 130
41 changes: 40 additions & 1 deletion e2e/libs/workload/workload.py
@@ -7,7 +7,7 @@
from utility.utility import logging

from workload.constant import WAIT_FOR_POD_STABLE_MAX_RETRY

from workload.constant import WAIT_FOR_POD_KEPT_IN_STATE_TIME

def get_workload_pod_names(workload_name):
api = client.CoreV1Api()
@@ -182,3 +182,42 @@ async def wait_for_workload_pods_stable(workload_name, namespace="default"):
await asyncio.sleep(retry_interval)

assert False, f"Timeout waiting for {workload_name} pods {wait_for_stable_pod} stable)"


def wait_for_workload_pods_kept_in_state(workload_name, nodes_to_check, expect_state, namespace="default"):
# If a pod sits on one of the nodes to check, verify that it stays in expect_state.
retry_count, retry_interval = get_retry_count_and_interval()
pod_in_state_count = {}
wait_for_in_state_pod = []

for i in range(retry_count):
pods = get_workload_pods(workload_name, namespace=namespace)
pods_on_specific_nodes = []
# Collect the pods running on the nodes being checked
for pod in pods:
if pod.spec.node_name in nodes_to_check:
pods_on_specific_nodes.append(pod)

if len(pods_on_specific_nodes) > 0:
for pod in pods_on_specific_nodes:
pod_name = pod.metadata.name
if pod_name not in pod_in_state_count:
pod_in_state_count[pod_name] = 0
elif (expect_state == "ContainerCreating" and pod.status.phase == "Pending") or \
(expect_state == "Terminating" and pod.metadata.deletion_timestamp is not None and pod.status.phase == "Running"):
pod_in_state_count[pod_name] += 1
else:
pod_in_state_count[pod_name] = 0

wait_for_in_state_pod = []
for pod in pods_on_specific_nodes:
pod_name = pod.metadata.name
logging(f'Waiting for pod {pod_name} on {nodes_to_check} kept in {expect_state}')
if pod_in_state_count[pod_name] < WAIT_FOR_POD_KEPT_IN_STATE_TIME:
wait_for_in_state_pod.append(pod_name)

if len(wait_for_in_state_pod) == 0:
return
time.sleep(retry_interval)

assert False, f"Timeout waiting for {workload_name} pods on {nodes_to_check} to be kept in {expect_state}"
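A pod only counts as "kept" in the expected state after it has matched that state for WAIT_FOR_POD_KEPT_IN_STATE_TIME consecutive polls. The per-poll check itself reduces to a small predicate over the pod object; a minimal sketch of that mapping (the helper name is illustrative and assumes a V1Pod from the Kubernetes Python client):

def pod_matches_expected_state(pod, expect_state):
    # "ContainerCreating" is not a pod phase; such pods are reported as Pending.
    if expect_state == "ContainerCreating":
        return pod.status.phase == "Pending"
    # A terminating pod still reports phase Running but carries a deletion timestamp.
    if expect_state == "Terminating":
        return pod.metadata.deletion_timestamp is not None and pod.status.phase == "Running"
    return False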
2 changes: 2 additions & 0 deletions e2e/templates/workload/statefulset.yaml
@@ -32,6 +32,8 @@ spec:
volumeClaimTemplates:
- metadata:
name: pod-data
labels:
test.longhorn.io: e2e
spec:
accessModes:
- ReadWriteOnce