Description
While exploring issue #276 further, it was noticed that idle connections sometimes get stuck in a state where the connection is established (i.e., the kube-apiserver receives a DIAL_RSP from the proxy-server), but thereafter there is complete inactivity from the apiserver, i.e., no packets of PacketType_DATA are sent.
In order to reproduce this, I added some debug statements to the konnectivity-client
library which is vendored in Kubernetes and built a local cluster.
This is the diff in the kubernetes repo to build a local cluster
diff --git a/hack/local-up-cluster.sh b/hack/local-up-cluster.sh
index abe85bb29fc..49b526ef080 100755
--- a/hack/local-up-cluster.sh
+++ b/hack/local-up-cluster.sh
@@ -553,7 +553,20 @@ kind: EgressSelectorConfiguration
egressSelections:
- name: cluster
connection:
- proxyProtocol: Direct
+ # This controls the protocol between the API Server and the Konnectivity
+ # server. Supported values are "GRPC" and "HTTPConnect". There is no
+ # end user visible difference between the two modes. You need to set the
+ # Konnectivity server to work in the same mode.
+ proxyProtocol: GRPC
+ transport:
+ # This controls what transport the API Server uses to communicate with the
+ # Konnectivity server. UDS is recommended if the Konnectivity server
+ # locates on the same machine as the API Server. You need to configure the
+ # Konnectivity server to listen on the same UDS socket.
+ # The other supported transport is "tcp". You will need to set up TLS
+ # config to secure the TCP transport.
+ uds:
+ udsName: /tmp/uds-proxy
- name: controlplane
connection:
proxyProtocol: Direct
@@ -617,6 +630,7 @@ EOF
--requestheader-allowed-names=system:auth-proxy \
--proxy-client-cert-file="${CERT_DIR}/client-auth-proxy.crt" \
--proxy-client-key-file="${CERT_DIR}/client-auth-proxy.key" \
+ --api-audiences=https://kubernetes.default.svc,system:konnectivity-server \
--cors-allowed-origins="${API_CORS_ALLOWED_ORIGINS}" >"${APISERVER_LOG}" 2>&1 &
APISERVER_PID=$!
diff --git a/vendor/sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/client/client.go b/vendor/sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/client/client.go
index 31bf9e11fe5..82880b2f997 100644
--- a/vendor/sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/client/client.go
+++ b/vendor/sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/client/client.go
@@ -109,6 +109,7 @@ func (t *grpcTunnel) serve(c clientConn) {
switch pkt.Type {
case client.PacketType_DIAL_RSP:
resp := pkt.GetDialResponse()
+ fmt.Println("DEBUG - Receive PacketType_DIAL_RSP for connection ID=", resp.ConnectID)
t.pendingDialLock.RLock()
ch, ok := t.pendingDial[resp.Random]
t.pendingDialLock.RUnlock()
@@ -120,7 +121,7 @@ func (t *grpcTunnel) serve(c clientConn) {
err: resp.Error,
connid: resp.ConnectID,
}
- select {
+ select {
case ch <- result:
default:
klog.ErrorS(fmt.Errorf("blocked pending channel"), "Received second dial response for connection request", "connectionID", resp.ConnectID, "dialID", resp.Random)
@@ -136,6 +137,7 @@ func (t *grpcTunnel) serve(c clientConn) {
case client.PacketType_DATA:
resp := pkt.GetData()
+ fmt.Println("DEBUG - Receive PacketType_DATA for connection ID=", resp.ConnectID)
// TODO: flow control
t.connsLock.RLock()
conn, ok := t.conns[resp.ConnectID]
@@ -155,6 +157,7 @@ func (t *grpcTunnel) serve(c clientConn) {
}
case client.PacketType_CLOSE_RSP:
resp := pkt.GetCloseResponse()
+ fmt.Println("DEBUG - Receive PacketType_CLOSE_RSP for connection ID=", resp.ConnectID)
t.connsLock.RLock()
conn, ok := t.conns[resp.ConnectID]
t.connsLock.RUnlock()
@@ -211,7 +214,7 @@ func (t *grpcTunnel) DialContext(ctx context.Context, protocol, address string)
klog.V(5).Infoln("DIAL_REQ sent to proxy server")
c := &conn{stream: t.stream, random: random}
-
+ fmt.Println("DEBUG - Send PacketType_DIAL_REQ")
select {
case res := <-resCh:
if res.err != "" {
diff --git a/vendor/sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/client/conn.go b/vendor/sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/client/conn.go
index cc6e66be237..acf0ee7ce61 100644
--- a/vendor/sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/client/conn.go
+++ b/vendor/sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/client/conn.go
@@ -18,6 +18,7 @@ package client
import (
"errors"
+ "fmt"
"io"
"net"
"time"
@@ -61,6 +62,7 @@ func (c *conn) Write(data []byte) (n int, err error) {
if err != nil {
return 0, err
}
+ fmt.Println("DEBUG - Send PacketType_DATA for connection ID=", c.connID)
return len(data), err
}
@@ -116,6 +118,7 @@ func (c *conn) Close() error {
klog.V(4).Infoln("closing connection")
var req *client.Packet
if c.connID != 0 {
+ fmt.Println("DEBUG - Send PacketType_CLOSE_REQ for connection ID=", c.connID)
req = &client.Packet{
Type: client.PacketType_CLOSE_REQ,
Payload: &client.Packet_CloseRequest{
Once the local cluster is up with ./hack/local-up-cluster.sh, you can start the proxy-server and the agent as local processes in separate terminals with:
make clean && make certs && make gen && make build
./bin/proxy-server --server-port=0 --uds-name=/tmp/uds-proxy --cluster-ca-cert=certs/agent/issued/ca.crt --cluster-cert=certs/agent/issued/proxy-frontend.crt --cluster-key=certs/agent/private/proxy-frontend.key --warn-on-channel-limit --v 5 --keepalive-time 1m --frontend-keepalive-time 1m --enable-profiling 2>&1 | tee server.log
./bin/proxy-agent --ca-cert=certs/agent/issued/ca.crt --agent-cert=certs/agent/issued/proxy-agent.crt --agent-key=certs/agent/private/proxy-agent.key
Stress the cluster, with concurrent kubectl log requests
Create some workloads
kubectl create deployment nginx --image nginx --replicas 20
get-logs.sh script:
#!/bin/bash
# Fetch logs for 3 randomly chosen pods in ${NAMESPACE} (default: "default"),
# discarding the output. Used purely to generate load through the
# konnectivity tunnel.
NAMESPACE=${NAMESPACE:-"default"}

# Collect pod names into an array (intentionally unquoted so the
# newline-separated names word-split into elements).
pods=$(kubectl get pods -n "${NAMESPACE}" --no-headers | awk '{print $1}')
arr=(${pods})
count=${#arr[@]}

# Bail out early when there are no pods; the original FLOOR/RANGE arithmetic
# performed a modulo by zero here when only one pod existed.
if (( count == 0 )); then
    echo "no pods found in namespace ${NAMESPACE}" >&2
    exit 1
fi

for i in {1..3}
do
    # Uniform random index in [0, count-1]. The original computed
    # RANDOM % (count-1) + 1, which both skipped arr[0] and failed for a
    # single-pod namespace.
    idx=$(( RANDOM % count ))
    kubectl logs -n "${NAMESPACE}" "${arr[${idx}]}" > /dev/null 2> /dev/null
done
run.sh script:
#!/bin/bash
# Kick off 500 concurrent runs of get-logs.sh to stress the cluster with
# parallel `kubectl logs` requests.
set -euo pipefail

for run in $(seq 1 500); do
    ./get-logs.sh &
done
Execute the run.sh script after saving and giving execute permissions:
./run.sh
To verify the above hypothesis, I've created a small script that prints those connection IDs which are idle, i.e., a CLOSE_REQ
was never sent by the api-server, and as a result a CLOSE_RSP
was never received from the proxy-server.
check.sh script:
#!/bin/bash
# Classify every connection ID seen in the apiserver log by whether a
# CLOSE_REQ was sent and a CLOSE_RSP was received, using the DEBUG lines
# added to the vendored konnectivity-client.
set -euo pipefail

LOG=/tmp/kube-apiserver.log

# Connection IDs are handed out sequentially, so the count of DIAL_REQ
# sends doubles as the highest connection ID to check.
total_connection_ids=$(rg "DEBUG - Send PacketType_DIAL_REQ" "${LOG}" | wc -l)

for connection_id in $(seq 1 "${total_connection_ids}"); do
    req_status=0
    res_status=0
    # The trailing end-of-line anchor prevents ID 5 from also matching 50,
    # 500, etc. The `|| var=1` form keeps a failed match from tripping -e.
    rg "DEBUG - Send PacketType_CLOSE_REQ for connection ID= ${connection_id}\$" "${LOG}" > /dev/null || req_status=1
    rg "DEBUG - Receive PacketType_CLOSE_RSP for connection ID= ${connection_id}\$" "${LOG}" > /dev/null || res_status=1

    if (( req_status == 1 && res_status == 1 )); then
        echo "Both missing for $connection_id"
    elif (( req_status == 0 && res_status == 1 )); then
        echo "CloseResponse missing for $connection_id"
    elif (( req_status == 1 && res_status == 0 )); then
        echo "Interesting case for $connection_id"
    fi
done
We are interested in the "Both missing for $connection_id" case. All of the connection IDs printed for this case, if cross-checked against the proxy-server logs, have only the following log snippet; for example, connectionID=505:
I1215 16:36:06.122100 15067 server.go:714] "Received DIAL_RSP" dialID=4406796995779167332 agentID="f902ce27-b9b6-44a6-aef7-cfad5aff3611" connectionID=505
I1215 16:36:06.122132 15067 server.go:269] "Register frontend for agent" frontend=&{Mode:grpc Grpc:0xc000ee5f80 HTTP:<nil> CloseHTTP:<nil> connected:0xc000c58840 connectID:505 agentID:f902ce27-b9b6-44a6-aef7-cfad5aff3611 start:{wall:13863927826982877159 ext:149658304691 loc:0x212d6e0} backend:0xc000616108} agentID="f902ce27-b9b6-44a6-aef7-cfad5aff3611" connectionID=505
You can also verify whether PacketType_DATA
was sent by the apiserver for the same connection ID:
grep "DEBUG - Send PacketType_DATA for connection ID=" /tmp/kube-apiserver.log | grep 505
The problematic aspect of this is that, in order to close/terminate these idle connections, we need the connectionID, which is only known once a DIAL_RSP is sent by the agent and received by the proxy-server, and once PacketType_DATA is sent by the apiserver to the proxy-server.