test: implement node network disconnection
Signed-off-by: Yang Chiu <yang.chiu@suse.com>
yangchiu committed Apr 8, 2024
1 parent 68f683c commit f83cce6
Showing 12 changed files with 115 additions and 56 deletions.
1 change: 1 addition & 0 deletions e2e/keywords/common.resource
@@ -9,6 +9,7 @@ Library    ../libs/keywords/statefulset_keywords.py
Library ../libs/keywords/stress_keywords.py
Library ../libs/keywords/volume_keywords.py
Library ../libs/keywords/workload_keywords.py
Library ../libs/keywords/persistentvolumeclaim_keywords.py

*** Keywords ***
Set test environment
16 changes: 16 additions & 0 deletions e2e/keywords/network.resource
@@ -0,0 +1,16 @@
*** Settings ***
Documentation Common keywords
Library ../libs/keywords/network_keywords.py
Library ../libs/keywords/workload_keywords.py
Library ../libs/keywords/volume_keywords.py

*** Variables ***


*** Keywords ***
Disconnect volume node network of statefulset ${statefulset_id} for ${duration} seconds
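    # Resolve the statefulset's volume, find the node it is attached to, and cut that node's network.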
${workload_name} = generate_name_with_suffix statefulset ${statefulset_id}
${volume_name} = get_workload_volume_name ${workload_name}
${node_name} = get_volume_node ${volume_name}
disconnect_node_network ${node_name} ${duration}
4 changes: 4 additions & 0 deletions e2e/libs/keywords/network_keywords.py
@@ -1,5 +1,6 @@
from network.network import setup_control_plane_network_latency
from network.network import cleanup_control_plane_network_latency
from network.network import disconnect_node_network

from utility.utility import logging

@@ -13,3 +14,6 @@ def setup_control_plane_network_latency(self):
def cleanup_control_plane_network_latency(self):
logging(f"Cleaning up control plane network latency")
cleanup_control_plane_network_latency()

def disconnect_node_network(self, node_name, disconnection_time_in_sec):
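        # Delegates to the module-level disconnect_node_network imported above;
        # Robot Framework passes arguments as strings, hence the int() conversion.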
disconnect_node_network(node_name, int(disconnection_time_in_sec))
3 changes: 3 additions & 0 deletions e2e/libs/keywords/volume_keywords.py
@@ -50,6 +50,9 @@ def get_replica_node_ids(self, volume_name):
node_ids.extend(self.get_node_ids_by_replica_locality(volume_name, "test pod node"))
return node_ids

def get_volume_node(self, volume_name):
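        # "volume node" is the replica locality denoting the node the volume is currently attached to.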
return self.get_node_id_by_replica_locality(volume_name, "volume node")

def get_node_id_by_replica_locality(self, volume_name, replica_locality):
return self.get_node_ids_by_replica_locality(volume_name, replica_locality)[0]

20 changes: 19 additions & 1 deletion e2e/libs/network/network.py
@@ -1,7 +1,11 @@
from robot.libraries.BuiltIn import BuiltIn

from node import Node
from node_exec import NodeExec
from workload.pod import create_pod
from workload.pod import delete_pod
from workload.pod import new_pod_manifest
from workload.pod import IMAGE_BUSYBOX
import time


def get_control_plane_node_network_latency_in_ms():
@@ -31,3 +35,17 @@ def cleanup_control_plane_network_latency():
cmd = f"tc qdisc show dev eth0 | grep -v delay"
res = NodeExec.get_instance().issue_cmd(control_plane_node, cmd)
assert res, "cleanup control plane network failed"

def disconnect_node_network(node_name, disconnection_time_in_sec=10):
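    # Launch a helper pod pinned to the target node. Via nsenter it enters the
    # host's mount and network namespaces, sleeps 10 seconds (so pod creation can
    # settle before the link drops), blackholes eth0 with `tc netem loss 100%`,
    # then restores eth0 itself once the outage window ends.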
manifest = new_pod_manifest(
image=IMAGE_BUSYBOX,
command=["nsenter", "--mount=/rootfs/proc/1/ns/mnt", "--net=/rootfs/proc/1/ns/net", "--", "sh"],
args=["-c", f"sleep 10 && tc qdisc replace dev eth0 root netem loss 100% && sleep {disconnection_time_in_sec} && tc qdisc del dev eth0 root"],
node_name=node_name
)
pod_name = manifest['metadata']['name']
create_pod(manifest, is_wait_for_pod_running=True)

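    # Block for the outage window before removing the helper pod.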
time.sleep(disconnection_time_in_sec)

delete_pod(pod_name)
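
A minimal sketch of driving the new helper directly ("worker-1" is a hypothetical node name; assumes e2e/libs is on the import path and the Kubernetes client is initialized):

    from network.network import disconnect_node_network

    # Blackhole eth0 on worker-1 for 30 seconds; the helper pod restores the link itself.
    disconnect_node_network("worker-1", disconnection_time_in_sec=30)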
15 changes: 9 additions & 6 deletions e2e/libs/node_exec/node_exec.py
@@ -49,12 +49,15 @@ def set_namespace(self, namespace):
def cleanup(self):
for pod in self.node_exec_pod.values():
logging(f"Cleaning up pod {pod.metadata.name} {pod.metadata.uid}")
-            res = self.core_api.delete_namespaced_pod(
-                name=pod.metadata.name,
-                namespace=self.namespace,
-                body=client.V1DeleteOptions()
-            )
-            wait_delete_pod(pod.metadata.name)
+            try:
+                res = self.core_api.delete_namespaced_pod(
+                    name=pod.metadata.name,
+                    namespace=self.namespace,
+                    body=client.V1DeleteOptions()
+                )
+                wait_delete_pod(pod.metadata.uid)
+            except Exception as e:
+                assert e.status == 404
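+                # a 404 just means the pod was already deleted, which is acceptable here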
self.core_api.delete_namespace(
name=self.namespace
)
24 changes: 15 additions & 9 deletions e2e/libs/volume/crd.py
@@ -206,37 +206,43 @@ def wait_for_volume_delete(self, volume_name):
        assert False, f"expect volume {volume_name} deleted but it still exists"

    def wait_for_volume_state(self, volume_name, desired_state):
+        volume = None
        for i in range(self.retry_count):
            logging(f"Waiting for {volume_name} {desired_state} ({i}) ...")
            try:
-                if self.get(volume_name)["status"]["state"] == desired_state:
+                volume = self.get(volume_name)
+                if volume["status"]["state"] == desired_state:
                    break
            except Exception as e:
-                logging(f"Getting volume {self.get(volume_name)} status error: {e}")
+                logging(f"Getting volume {volume} status error: {e}")
            time.sleep(self.retry_interval)
-        assert self.get(volume_name)["status"]["state"] == desired_state
+        assert volume["status"]["state"] == desired_state

    def wait_for_volume_robustness(self, volume_name, desired_state):
+        volume = None
        for i in range(self.retry_count):
            logging(f"Waiting for {volume_name} {desired_state} ({i}) ...")
            try:
-                if self.get(volume_name)["status"]["robustness"] == desired_state:
+                volume = self.get(volume_name)
+                if volume["status"]["robustness"] == desired_state:
                    break
            except Exception as e:
-                logging(f"Getting volume {self.get(volume_name)} robustness error: {e}")
+                logging(f"Getting volume {volume} robustness error: {e}")
            time.sleep(self.retry_interval)
-        assert self.get(volume_name)["status"]["robustness"] == desired_state
+        assert volume["status"]["robustness"] == desired_state

    def wait_for_volume_robustness_not(self, volume_name, not_desired_state):
+        volume = None
        for i in range(self.retry_count):
            logging(f"Waiting for {volume_name} robustness not {not_desired_state} ({i}) ...")
            try:
-                if self.get(volume_name)["status"]["robustness"] != not_desired_state:
+                volume = self.get(volume_name)
+                if volume["status"]["robustness"] != not_desired_state:
                    break
            except Exception as e:
-                logging(f"Getting volume {self.get(volume_name)} robustness error: {e}")
+                logging(f"Getting volume {volume} robustness error: {e}")
            time.sleep(self.retry_interval)
-        assert self.get(volume_name)["status"]["robustness"] != not_desired_state
+        assert volume["status"]["robustness"] != not_desired_state

def wait_for_volume_expand_to_size(self, volume_name, expected_size):
engine = None
2 changes: 1 addition & 1 deletion e2e/libs/volume/volume.py
@@ -49,8 +49,8 @@ def wait_for_volume_state(self, volume_name, desired_state):
return self.volume.wait_for_volume_state(volume_name, desired_state)

def wait_for_volume_attached(self, volume_name):
-        self.volume.wait_for_volume_state(volume_name, "attached")
        self.volume.wait_for_volume_robustness_not(volume_name, "unknown")
+        self.volume.wait_for_volume_state(volume_name, "attached")

def wait_for_volume_detached(self, volume_name):
self.volume.wait_for_volume_state(volume_name, "detached")
2 changes: 1 addition & 1 deletion e2e/libs/workload/constant.py
@@ -2,4 +2,4 @@
IMAGE_LITMUX = 'litmuschaos/go-runner:latest'
IMAGE_UBUNTU = 'ubuntu:16.04'

-WAIT_FOR_POD_STABLE_MAX_RETRY = 60
+WAIT_FOR_POD_STABLE_MAX_RETRY = 120
10 changes: 9 additions & 1 deletion e2e/libs/workload/pod.py
@@ -14,7 +14,7 @@ def new_pod_manifest(pod_name="", image="", command=[], args=[],
claim_name="", node_name="", labels={}):
if pod_name == "":
        pod_name = generate_name_random()
-
+    logging(f"Creating pod for {command} {args} on {node_name}")
# Set default image and args
if image is None:
image = IMAGE_BUSYBOX
@@ -49,6 +49,9 @@ def new_pod_manifest(pod_name="", image="", command=[], args=[],
}, {
'name': 'rancher',
'mountPath': '/var/lib/rancher'
}, {
'name': 'rootfs',
'mountPath': '/rootfs'
}]
}],
'volumes': [{
@@ -61,6 +64,11 @@ def new_pod_manifest(pod_name="", image="", command=[], args=[],
'hostPath': {
'path': '/var/lib/rancher'
}
}, {
'name': 'rootfs',
'hostPath': {
'path': '/'
}
}]
}
}
37 changes: 0 additions & 37 deletions e2e/templates/litmus/reboot-node.yaml

This file was deleted.

37 changes: 37 additions & 0 deletions e2e/tests/network_disconnect.robot
@@ -0,0 +1,37 @@
*** Settings ***
Documentation Negative Test Cases
Resource ../keywords/volume.resource
Resource ../keywords/statefulset.resource
Resource ../keywords/workload.resource
Resource ../keywords/common.resource
Resource ../keywords/network.resource

Test Setup Set test environment
Test Teardown Cleanup test resources

*** Variables ***
${LOOP_COUNT} 1
${LATENCY_IN_MS} 0
${RETRY_COUNT} 300
${RETRY_INTERVAL} 1

*** Test Cases ***
Disconnect Volume Node Network While Workload Heavy Writing
Given Create statefulset 0 using RWO volume
FOR ${i} IN RANGE ${LOOP_COUNT}
And Keep writing data to pod of statefulset 0
When Disconnect volume node network of statefulset 0 for 10 seconds
And Wait for volume of statefulset 0 healthy
And Wait for statefulset 0 pods stable
Then Check statefulset 0 works
END

Disconnect Volume Node Network For More Than Pod Eviction Timeout While Workload Heavy Writing
Given Create statefulset 0 using RWO volume
FOR ${i} IN RANGE ${LOOP_COUNT}
And Keep writing data to pod of statefulset 0
When Disconnect volume node network of statefulset 0 for 360 seconds
And Wait for volume of statefulset 0 healthy
And Wait for statefulset 0 pods stable
Then Check statefulset 0 works
END
