Skip to content

Commit

Permalink
fix: endless loop in connect.py (#628)
Browse files Browse the repository at this point in the history
* fix: endless loop in connect.py & minikube tests
  • Loading branch information
SteinRobert authored Jun 10, 2024
1 parent c816a35 commit 4138fc3
Show file tree
Hide file tree
Showing 15 changed files with 108 additions and 120 deletions.
37 changes: 26 additions & 11 deletions .github/workflows/python-tester.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ jobs:
tags: ${{ steps.operator_meta.outputs.tags }}
labels: ${{ steps.operator_meta.outputs.labels }}
cache-from: type=registry,ref=quay.io/gefyra/operator:latest
# Goutputs: type=docker,dest=/tmp/operator.tar
outputs: type=docker,dest=/tmp/operator.tar
- name: Upload operator image
uses: actions/upload-artifact@v4
with:
Expand Down Expand Up @@ -209,11 +209,14 @@ jobs:
include:
- os: ubuntu-latest
driver: ''
kubectl_os: linux
- os: macos-12
driver: virtualbox
kubectl_os: darwin
needs:
- build_operator
- build_cargo
- build_test_image
runs-on: ${{ matrix.os }}
steps:
- name: Checkout
Expand Down Expand Up @@ -248,16 +251,19 @@ jobs:
uses: actions/download-artifact@v4
with:
path: /tmp
- name: Load Gefyra images
run: |
docker load --input /tmp/pyserver/pyserver.tar
docker load --input /tmp/cargo/cargo.tar
docker image ls -a
- name: start minikube
id: minikube
uses: medyagh/setup-minikube@latest
with:
start-args: --ports 127.0.0.1:31820:31820/udp
- name: Load Gefyra images
run: |
eval $(minikube docker-env)
docker load --input /tmp/pyserver/pyserver.tar
docker load --input /tmp/cargo/cargo.tar
docker load --input /tmp/operator/operator.tar
docker tag ${{ needs.build_operator.outputs.tags }} quay.io/gefyra/operator:2.1.6
docker image ls -a
- name: Install dependencies
working-directory: ./client
run: poetry install --with dev --no-interaction --no-root
Expand All @@ -270,10 +276,17 @@ jobs:
working-directory: ./client
if: ${{ failure() }}
run: |
docker logs gefyra-cargo-default
kubectl logs -n gefyra gefyra-stowaway-0
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/${{ matrix.kubectl_os }}/amd64/kubectl"
chmod +x ./kubectl
./kubectl get pod -n gefyra
./kubectl logs -n gefyra gefyra-stowaway-0
./kubectl logs --previous -n gefyra deploy/gefyra-operator-webhook
./kubectl logs -n gefyra deploy/gefyra-operator-webhook
./kubectl describe pods -n gefyra -l gefyra.dev/role=webhook
./kubectl logs -n gefyra deploy/gefyra-operator
docker inspect minikube
docker inspect gefyra-cargo-default
docker logs gefyra-cargo-default
- name: Show coverage report
working-directory: ./client
run: |
Expand Down Expand Up @@ -350,10 +363,12 @@ jobs:
run: |
docker load --input /tmp/pyserver/pyserver.tar
docker load --input /tmp/cargo/cargo.tar
docker load --input /tmp/operator/operator.tar
docker image ls -a
# - name: Import Operator to k3d
# run: |
# k3d image import ${{ needs.build_operator.outputs.tags }} -c mycluster --verbose
docker tag ${{ needs.build_operator.outputs.tags }} quay.io/gefyra/operator:2.1.6
- name: Import Operator to k3d
run: |
k3d image import quay.io/gefyra/operator:2.1.6 -c mycluster --verbose
- name: Pytest
working-directory: ./client
run: poetry install --with dev --no-interaction --no-root
Expand Down
7 changes: 4 additions & 3 deletions client/gefyra/api/connect.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,16 +104,17 @@ def connect( # noqa: C901
config.CARGO_PROBE_TIMEOUT = probe_timeout

_retry = 0
while _retry < 5:
while _retry < 10:
gefyra_network = get_or_create_gefyra_network(config)
try:
client.activate_connection(
gefyra_network.attrs["IPAM"]["Config"][0]["Subnet"]
)
break
except kubernetes.client.exceptions.ApiException as e:
_retry += 1
if e.status == 500:
logger.debug(f"Could not activate connection, retrying {_retry}/5...")
logger.debug(f"Could not activate connection, retrying {_retry}/10...")
# if the given subnet is taken in the cluster (by another client), recreate the network and try again
# hopefully the IPAM config will give a new subnet
gefyra_network.remove()
Expand All @@ -127,7 +128,7 @@ def connect( # noqa: C901
break
else:
_i += 1
time.sleep(0.5)
time.sleep(1)
else:
raise GefyraConnectionError("Could not activate connection") from None
client.update()
Expand Down
11 changes: 9 additions & 2 deletions client/gefyra/api/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,20 +103,27 @@ def install(
from kubernetes.watch import Watch

w = Watch()

operator_ready = False
operator_webhook_ready = False
# block (forever) until Gefyra cluster side is ready
for event in w.stream(
config.K8S_CORE_API.list_namespaced_event, namespace=config.NAMESPACE
):
if event["object"].reason in ["Pulling", "Pulled"]:
logger.info(event["object"].message)
if event["object"].reason == "Gefyra-Ready":
operator_ready = True
logger.debug("Gefyra operator is ready")
if event["object"].reason == "Gefyra-Webhook-Ready":
operator_webhook_ready = True
logger.debug("Gefyra operator webhook is ready")
if operator_ready and operator_webhook_ready:
toc = time.perf_counter()
logger.info(f"Gefyra became ready in {toc - tic:0.4f} seconds")
break
# busywait for the operator webhook to become ready
_i = 0
while _i < 10:
while _i < 20:
webhook_deploy = config.K8S_APP_API.read_namespaced_deployment(
name="gefyra-operator-webhook", namespace=config.NAMESPACE
)
Expand Down
7 changes: 6 additions & 1 deletion client/gefyra/api/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def _get_cluster_status(config: ClientConfiguration) -> GefyraClusterStatus:
stowaway=False,
stowaway_image="",
namespace=False,
operator_webhook=False,
)
# check if connected to the cluster
try:
Expand Down Expand Up @@ -115,6 +116,7 @@ def _get_cluster_status(config: ClientConfiguration) -> GefyraClusterStatus:
and operator_deploy.status.ready_replicas >= 1
):
_status.operator = True
_status.operator_webhook = True
_status.operator_image = operator_deploy.spec.template.spec.containers[
0
].image
Expand Down Expand Up @@ -158,7 +160,10 @@ def status(connection_name: str = "") -> GefyraStatus:
summary = StatusSummary.UP
else:
if client.cargo or (
cluster.connected and cluster.operator and cluster.stowaway
cluster.connected
and cluster.operator
and cluster.stowaway
and cluster.operator_webhook
):
summary = StatusSummary.INCOMPLETE
else:
Expand Down
2 changes: 1 addition & 1 deletion client/gefyra/cli/updown.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def cluster_up(ctx, minikube: Optional[str] = None, preset: Optional[str] = None
bar.title = "Waiting for the Gefyra client to enter 'waiting' state"
# busy wait for the client to enter the waiting state
_i = 0
while _i < 10:
while _i < 20:
try:
json_str = api.write_client_file(
client_id=client.client_id,
Expand Down
5 changes: 3 additions & 2 deletions client/gefyra/misc/comps/deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def data(params: "GefyraInstallOptions") -> list[dict]:
{
"name": "gefyra",
"image": f"quay.io/gefyra/operator:{params.version}",
"imagePullPolicy": "Always",
"imagePullPolicy": "IfNotPresent",
"ports": [{"containerPort": 9443}],
"env": [
{
Expand Down Expand Up @@ -89,7 +89,7 @@ def data(params: "GefyraInstallOptions") -> list[dict]:
{
"name": "gefyra",
"image": f"quay.io/gefyra/operator:{params.version}",
"imagePullPolicy": "Always",
"imagePullPolicy": "IfNotPresent",
"ports": [{"containerPort": 9443}],
"env": [
{"name": "OP_MODE", "value": "webhook"},
Expand All @@ -111,6 +111,7 @@ def data(params: "GefyraInstallOptions") -> list[dict]:
},
"initialDelaySeconds": 5,
"periodSeconds": 5,
"timeoutSeconds": 3,
},
}
],
Expand Down
2 changes: 2 additions & 0 deletions client/gefyra/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,8 @@ class GefyraClusterStatus:
stowaway_image: str
# the gefyra namespace is available
namespace: bool
# operator webhook
operator_webhook: bool


@dataclass
Expand Down
17 changes: 12 additions & 5 deletions operator/gefyra/clientstate.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ def create_service_account(self):
def on_enable(self):
self.logger.info(f"Client '{self.object_name}' is being enabled")

def on_activate(self):
self.logger.info(f"Client '{self.object_name}' is being activated")

def on_disable(self):
self.logger.info(f"Client '{self.object_name}' is being disabled")
self.cleanup_all_bridges()
Expand All @@ -150,10 +153,8 @@ def on_terminate(self):

def can_add_client(self):
if self.connection_provider.peer_exists(self.object_name):
self.logger.error(
f"Client '{self.object_name}' already exists, cannot enable connection."
)
return False
self.logger.warning(f"Client '{self.object_name}' already exists.")
return True
else:
self.connection_provider.add_peer(
self.object_name, self.data["providerParameter"]
Expand All @@ -167,7 +168,13 @@ def enable_connection(self):
raise kopf.TemporaryError(
f"Cannot read connection data from provider: {e}", delay=1
)
self._patch_object({"providerConfig": conn_data})
try:
self._patch_object({"providerConfig": conn_data})
except k8s.client.ApiException as e:
if e.status == 500:
raise kopf.TemporaryError(
f"Cannot enable connection: {e.reason}", delay=1
)

def disable_connection(self):
try:
Expand Down
4 changes: 3 additions & 1 deletion operator/gefyra/handler/clients.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ async def client_connection_changed(new, body, logger, **kwargs):
obj = GefyraClientObject(body)
client = GefyraClient(obj, configuration, logger)
# check if parameters for this connection provider have been added or removed
logger.info(f"Client is: {client.current_state}")
if bool(new):
# activate this connection
try:
Expand All @@ -37,7 +38,8 @@ async def client_connection_changed(new, body, logger, **kwargs):
logger.error(f"ApiException: {e}")
if e.status == 500:
raise kopf.TemporaryError(
f"Could not activate connection: {e}", delay=1
f"Could not activate connection: {e}, \nClient is {client.current_state}",
delay=1,
)
else:
# deactivate this connection
Expand Down
20 changes: 20 additions & 0 deletions operator/gefyra/handler/configure_webhook.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
from datetime import datetime

import kubernetes as k8s
import kopf

from gefyra.clientstate import GefyraClient
Expand All @@ -11,6 +12,13 @@
connection_provider_factory,
)

from gefyra.resources.events import create_operator_webhook_ready_event


logger = logging.getLogger(__name__)

events = k8s.client.EventsV1Api()


@kopf.on.startup()
def configure(settings: kopf.OperatorSettings, **_):
Expand All @@ -33,6 +41,18 @@ def configure(settings: kopf.OperatorSettings, **_):
@kopf.on.validate("gefyraclients.gefyra.dev", id="client-parameters") # type: ignore
def check_validate_provider_parameters(body, diff, logger, operation, **_):
if body.get("check", False):

def _write_startup_task() -> None:
try:
events.create_namespaced_event(
body=create_operator_webhook_ready_event(configuration.NAMESPACE),
namespace=configuration.NAMESPACE,
)
except k8s.client.exceptions.ApiException as e:
if e.status != 409:
logger.error("Could not create startup event: " + str(e))

_write_startup_task()
return True
name = body["metadata"]["name"]
logger.info(f"Validating provider parameters for GefyraClient {name}")
Expand Down
1 change: 1 addition & 0 deletions operator/gefyra/handler/startup.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ async def start_connection_providers(logger, retry, **kwargs) -> None:
"""
from gefyra.configuration import configuration

logger.info("Starting up connection providers")
not_ready_providers = []
for gefyra_connector in ConnectionProviderType:
provider = connection_provider_factory.get(
Expand Down
2 changes: 1 addition & 1 deletion operator/gefyra/healthcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@
timeout=2,
verify=False,
)
print(res.content)

assert res.status_code == 200
19 changes: 19 additions & 0 deletions operator/gefyra/resources/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,22 @@ def create_operator_ready_event(namespace: str) -> k8s.client.EventsV1Event:
kind="deployment", name="gefyra-operator", namespace=namespace
),
)


def create_operator_webhook_ready_event(namespace: str) -> k8s.client.EventsV1Event:
now = _get_now()
return k8s.client.EventsV1Event(
metadata=k8s.client.V1ObjectMeta(
name="gefyra-operator-webhook-startup", namespace=namespace
),
reason="Gefyra-Webhook-Ready",
note="Operator Webhook has been started configured successfully",
event_time=now,
action="Startup",
type="Normal",
reporting_instance="gefyra-operator-webhook",
reporting_controller="gefyra-operator-webhook",
regarding=k8s.client.V1ObjectReference(
kind="deployment", name="gefyra-operator-webhook", namespace=namespace
),
)
1 change: 1 addition & 0 deletions operator/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def operator(k3d, stowaway_image, carrier_image):
for key in list(sys.modules.keys()):
if key.startswith("kopf"):
del sys.modules[key]
operator.timeout = 10
operator.__exit__(None, None, None)


Expand Down
Loading

0 comments on commit 4138fc3

Please sign in to comment.