Skip to content

Commit

Permalink
Merge pull request #923 from barakda/squid
Browse files Browse the repository at this point in the history
Squid branch update
  • Loading branch information
barakda authored Oct 28, 2024
2 parents fdfbb95 + 442e0ad commit 931d77e
Show file tree
Hide file tree
Showing 10 changed files with 141 additions and 18 deletions.
6 changes: 3 additions & 3 deletions .env
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Globals
VERSION="1.3.2"
CEPH_VERSION="18.2.4"
SPDK_VERSION="24.01"
VERSION="1.3.3"
CEPH_VERSION="19.2.0"
SPDK_VERSION="24.09"
CONTAINER_REGISTRY="quay.io/ceph"
QUAY_SPDK="${CONTAINER_REGISTRY}/spdk"
QUAY_CEPH="${CONTAINER_REGISTRY}/vstart-cluster"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/build-container.yml
Original file line number Diff line number Diff line change
Expand Up @@ -737,7 +737,7 @@ jobs:
strategy:
fail-fast: false
matrix:
test: ["sanity", "ns_lb_change", "state_transitions", "state_transitions_both_gws", "state_transitions_loop", "state_transitions_rand_loop", "late_registration", "late_registration_loop", "4gws", "4gws_loop", "4gws_create_delete", "4gws_create_delete_loop", "namespaces", "namespaces_loop", "mtls", "notify", "ceph_status"]
test: ["sanity", "ns_lb_change", "state_transitions", "state_transitions_both_gws", "state_transitions_loop", "state_transitions_rand_loop", "late_registration", "late_registration_loop", "4gws", "4gws_loop", "4gws_create_delete", "4gws_create_delete_loop", "namespaces", "namespaces_loop", "mtls", "notify", "ceph_status", "main_exit"]
runs-on: ubuntu-latest
env:
HUGEPAGES: 1024 # 4 spdk instances
Expand Down
14 changes: 3 additions & 11 deletions control/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,10 @@
#

import argparse
import signal
from .server import GatewayServer
from .config import GatewayConfig
from .utils import GatewayLogger

def sigterm_handler(signum, frame):
raise SystemExit(f"Gateway process terminated")

if __name__ == '__main__':
parser = argparse.ArgumentParser(prog="python3 -m control",
description="Manage NVMe gateways",
Expand All @@ -32,10 +28,6 @@ def sigterm_handler(signum, frame):
gw_logger = GatewayLogger(config)
config.display_environment_info(gw_logger.logger)
config.dump_config_file(gw_logger.logger)
try:
with GatewayServer(config) as gateway:
signal.signal(signal.SIGTERM, sigterm_handler)
gateway.serve()
gateway.keep_alive()
except SystemExit:
pass
with GatewayServer(config) as gateway:
gateway.serve()
gateway.keep_alive()
13 changes: 12 additions & 1 deletion control/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@
from .cephutils import CephUtils
from .prometheus import start_exporter

def sigterm_handler(signum, frame):
    """Signal handler for SIGTERM.

    Logs the received signal number through the GatewayLogger logger and
    then raises SystemExit(0) to end the process with exit status 0.

    Args:
        signum: number of the signal that was delivered.
        frame: stack frame that was interrupted by the signal (unused).

    Raises:
        SystemExit: always, with exit code 0.
    """
    GatewayLogger().logger.info(f"GatewayServer: SIGTERM received {signum=}")
    raise SystemExit(0)

def sigchld_handler(signum, frame):
"""Handle SIGCHLD, runs when a child process, like the spdk, terminates."""
logger = GatewayLogger().logger
Expand Down Expand Up @@ -179,6 +185,9 @@ def serve(self):
# install SIGCHLD handler
signal.signal(signal.SIGCHLD, sigchld_handler)

# install SIGTERM handler
signal.signal(signal.SIGTERM, sigterm_handler)

# Start monitor client
self._start_monitor_client()

Expand Down Expand Up @@ -574,7 +583,9 @@ def keep_alive(self):
consecutive_ping_failures += 1
if consecutive_ping_failures >= allowed_consecutive_spdk_ping_failures:
self.logger.critical(f"SPDK ping failed {consecutive_ping_failures} times, aborting")
break
raise SystemExit(f"SPDK ping failed, quitting gateway")
else:
self.logger.warning(f"SPDK ping failed {consecutive_ping_failures} times, will keep trying")
else:
consecutive_ping_failures = 0

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ build-backend = "pdm.backend"

[project]
name = "ceph-nvmeof"
version = "1.3.1"
version = "1.3.3"

description = "Service to provide Ceph storage over NVMe-oF protocol"
readme = "README.md"
requires-python = "~=3.9"
Expand Down
111 changes: 111 additions & 0 deletions tests/ha/main_exit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/bin/sh
# HA test "main_exit": starts a single nvmeof gateway with docker compose,
# sends an abort signal to its monitor client process from a background
# task, and then checks the exit code the gateway container finished with.
# -x traces every command, -e aborts on the first unhandled failure.
set -xe
# Number of nvmeof gateway containers requested via `--scale nvmeof=`.
SCALE=1
# Pool name passed to `ceph nvme-gw create`; RBD_POOL overrides the "rbd" default.
POOL="${RBD_POOL:-rbd}"

#######################################
# Background half of the main_exit test. It is started with `&` while
# `docker compose up` stays attached in the foreground.
# Steps: wait for the ceph container to report healthy, register the
# gateway with `ceph nvme-gw create`, wait until the gateway answers
# `get_subsystems`, then send SIGABRT to the monitor client process inside
# the gateway container.
# Globals:
#   POOL          (read) pool passed to `nvme-gw create`
#   CLI_TLS_ARGS  (read, optional) extra arguments for nvmeof-cli
#   GW_GROUP, i, GW_NAME, GW_IP, SIGNAL, PID, container_status (written)
# Returns:
#   exits with status 1 when the monitor client process cannot be found.
#######################################
background_task() {

  # Give gateway some time
  sleep 5

  # Waiting for the ceph container to become healthy
  while true; do
    container_status=$(docker inspect --format='{{.State.Health.Status}}' ceph)
    if [ "$container_status" = "healthy" ]; then
      # success
      break
    fi
    # Wait for a specific time before checking again
    sleep 1
    printf .
  done
  echo "✅ ceph is healthy"

  echo "ℹ️ Running processes of services"
  docker compose top

  echo "ℹ️ Send nvme-gw create for all gateways"
  GW_GROUP=''
  i=1 # a single gw index
  # `docker ps` lists running containers only, so poll until the gateway
  # shows up instead of passing an empty name to `nvme-gw create`.
  GW_NAME=''
  while [ -z "$GW_NAME" ]; do
    # The index is handed to awk with -v rather than spliced into the
    # program text; it is still applied as a regex against the name.
    GW_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' \
      | grep -v discovery \
      | awk -v idx="$i" '$2 ~ /nvmeof/ && $2 ~ idx {print $1}')
    if [ -z "$GW_NAME" ]; then
      sleep 1
    fi
  done
  echo "📫 nvme-gw create gateway: '$GW_NAME' pool: '$POOL', group: '$GW_GROUP'"
  docker compose exec -T ceph ceph nvme-gw create "$GW_NAME" "$POOL" "$GW_GROUP"

  echo "ℹ️ Wait for gateway to be ready"
  while true; do
    sleep 1 # Adjust the sleep duration as needed
    container_status=$(docker inspect -f '{{.State.Status}}' "$GW_NAME")
    # POSIX `[` only defines `=` for string equality; `==` is a bashism that
    # fails under shells such as dash when the script runs via #!/bin/sh.
    if [ "$container_status" = "running" ]; then
      echo "Container $i $GW_NAME is now running."
    else
      echo "Container $i $GW_NAME is still not running. Waiting..."
      continue
    fi
    GW_IP="$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$GW_NAME")"
    # CLI_TLS_ARGS is left unquoted on purpose: it may be empty or carry
    # several separate arguments.
    # shellcheck disable=SC2086
    if docker compose run --rm nvmeof-cli $CLI_TLS_ARGS --server-address "$GW_IP" --server-port 5500 get_subsystems 2>&1 | grep -i failed; then
      echo "Container $i $GW_NAME $GW_IP no subsystems. Waiting..."
      continue
    fi
    echo "Container $i $GW_NAME $GW_IP subsystems:"
    # shellcheck disable=SC2086
    docker compose run --rm nvmeof-cli $CLI_TLS_ARGS --server-address "$GW_IP" --server-port 5500 get_subsystems
    break
  done

  # Signal to send to the monitor client process
  SIGNAL="SIGABRT"

  # Get the PID of monitor_client inside the container: the first /proc
  # entry whose comm file contains 'ceph-nvmeof-mon'.
  PID=$(docker exec "$GW_NAME" sh -c '
    for pid in /proc/*; do
      if [ -f "$pid/comm" ] && grep -q "ceph-nvmeof-mon" "$pid/comm"; then
        basename "$pid"
        break
      fi
    done')

  if [ -n "$PID" ]; then
    echo "ℹ️ Sending $SIGNAL to monitor_client (PID: $PID) in $GW_NAME..."
    docker exec "$GW_NAME" kill -s "$SIGNAL" "$PID"
  else
    echo "❌ monitor_client process not found in $GW_NAME."
    exit 1
  fi

}

##
## MAIN
##

# Run the scenario driver in the background; `docker compose up` below stays
# attached in the foreground until the gateway container stops.
background_task &
TASK_PID=$! # Capture the PID of the background task

echo "ℹ️ Starting $SCALE nvmeof gateways"
docker compose up --remove-orphans --scale nvmeof="$SCALE" nvmeof

# The gateway index must be set here: the `i` assigned inside background_task
# lives in the background process only and is never visible in this shell.
GW_INDEX=1
# -a: the gateway container has already exited at this point.
GW_NAME=$(docker ps -a --format '{{.ID}}\t{{.Names}}' \
  | grep -v discovery \
  | awk -v idx="$GW_INDEX" '$2 ~ /nvmeof/ && $2 ~ idx {print $1}')
docker inspect "$GW_NAME"
exit_code=$(docker inspect --format='{{.State.ExitCode}}' "$GW_NAME")

# expect exit code 1
if [ "$exit_code" -eq 1 ]; then
  echo "✅ gateway returned exit code 1, exiting with success."
else
  echo "❌ gateway returned exit code $exit_code, exiting with failure."
  # Best effort: do not leave the background task polling after the test
  # has already failed; it may have finished already, hence `|| true`.
  kill "$TASK_PID" 2>/dev/null || true
  exit 1 # Failure exit code
fi

# Wait for the background task to finish and capture its exit code.
# `wait ... || rc=$?` keeps `set -e` from aborting the script before the
# failure can be reported below.
background_task_exit_code=0
wait "$TASK_PID" || background_task_exit_code=$?

# Check the exit code and print the result
if [ "$background_task_exit_code" -eq 0 ]; then
  echo "✅ background task completed successfully"
else
  echo "❌ background task failed with exit code: $background_task_exit_code"
fi

# Exit with the same code as the background task
exit "$background_task_exit_code"


2 changes: 2 additions & 0 deletions tests/ha/setup_main_exit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Setup step of the main_exit test: nothing is done here besides reporting
# that the step is skipped.
set -e
printf '%s\n' 'ℹ️ Skipping setup for this test'
2 changes: 2 additions & 0 deletions tests/ha/start_up_main_exit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Start-up step of the main_exit test: nothing is done here besides
# reporting that the step is skipped.
set -e
printf '%s\n' 'ℹ️ Skipping start up for this test'
4 changes: 3 additions & 1 deletion tests/ha/wait_gateways.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/sh
set -ex
SCALE=2
echo CLI_TLS_ARGS $CLI_TLS_ARGS
# Check if argument is provided
Expand All @@ -16,7 +18,7 @@ for i in $(seq $SCALE); do
sleep 1 # Adjust the sleep duration as needed
GW_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' | awk '$2 ~ /nvmeof/ && $2 ~ /'$i'/ {print $1}')
container_status=$(docker inspect -f '{{.State.Status}}' "$GW_NAME")
if [ "$container_status" == "running" ]; then
if [ "$container_status" = "running" ]; then
echo "Container $i $GW_NAME is now running."
else
echo "Container $i $GW_NAME is still not running. Waiting..."
Expand Down
2 changes: 2 additions & 0 deletions tests/ha/wait_gateways_main_exit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Wait-for-gateways step of the main_exit test: nothing is done here besides
# reporting that the step is skipped.
set -e
printf '%s\n' 'ℹ️ Skipping wait gateways up for this test'

0 comments on commit 931d77e

Please sign in to comment.