backend: increase connection count before connect
There is a race between the time we connect to a backend and the time we
increase the connection count. This race is not normally observed, but when
the .max_connections property is used we could end up creating more connections
to the backend than the limit permits. This is problematic because we end up
consuming more system resources, e.g. file descriptors, than reserved for such
backends.

The problem is that we release the backend lock and wait for the connect
to finish. This can take up to connect_timeout, which defaults to 3.5 seconds,
and only then do we count the connection as consumed towards the
.max_connections property.
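
Condensed, the old ordering in vbe_dir_getfd() looked roughly like this
(a sketch only; the real function also manages the connwait queue, and
VCP_Get() stands in for the conn-pool call that performs the connect):

	Lck_Lock(bp->director->mtx);
	/* check bp->n_conn against bp->max_connections ... */
	Lck_Unlock(bp->director->mtx);	/* lock released here */

	/* can block for up to connect_timeout; other threads may pass
	 * the admission check above in the meantime */
	pfd = VCP_Get(/* ... */);

	Lck_Lock(bp->director->mtx);
	bp->n_conn++;			/* counted only after the connect */
	Lck_Unlock(bp->director->mtx);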

This is a recipe for bursty behavior: a backend that is already struggling
could be overwhelmed by a significant number of new connections. Then, once
the connection count exceeds the ceiling, we either fail fetches or queue them
up. But as soon as we drop below the limit again, we over-commit once more and
the cycle repeats.

The test cases use a blackhole backend to simulate the connect_timeout. This
reliably exercises the race; under normal circumstances the likelihood of the
race occurring depends on the traffic (fetches), networking conditions, and
the performance of the backend.

The solution to the problem is to increase the connection count before
connecting to the backend, and if we fail to get a connection to the
backend, to revert the count before failing the fetch. This makes any
fetches past the .max_connections limit either fail outright or
observe the presence of a queue to the backend.
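
Condensed, the fixed ordering becomes (again a sketch, mirroring the
diff below):

	Lck_Lock(bp->director->mtx);
	bp->n_conn++;			/* reserve the slot up front */
	Lck_Unlock(bp->director->mtx);

	pfd = VCP_Get(/* ... */);	/* may still block up to connect_timeout */

	if (pfd == NULL) {
		Lck_Lock(bp->director->mtx);
		bp->n_conn--;		/* revert the reservation */
		vbe_connwait_signal_locked(bp);	/* wake a queued waiter */
		Lck_Unlock(bp->director->mtx);
		/* fail the fetch */
	}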

Signed-off-by: Asad Sajjad Ahmed <asadsa@varnish-software.com>
asadsa92 committed Sep 3, 2024
1 parent f4bbb91 commit 5cbb34f
Showing 3 changed files with 106 additions and 16 deletions.
27 changes: 11 additions & 16 deletions bin/varnishd/cache/cache_backend.c
@@ -192,6 +192,7 @@ vbe_connwait_dequeue_locked(struct backend *bp, struct connwait *cw)
 {
 	Lck_AssertHeld(bp->director->mtx);
 	assert(cw->cw_state == CW_QUEUED);
+	bp->n_conn++;
 	VTAILQ_REMOVE(&bp->cw_head, cw, cw_list);
 	vbe_connwait_signal_locked(bp);
 	cw->cw_state = CW_DEQUEUED;
@@ -265,8 +266,10 @@ vbe_dir_getfd(VRT_CTX, struct worker *wrk, VCL_BACKEND dir, struct backend *bp,
 			VTAILQ_REMOVE(&bp->cw_head, cw, cw_list);
 			VSC_C_main->backend_wait_fail++;
 			cw->cw_state = CW_BE_BUSY;
-		}
-	}
+		} else
+			vbe_connwait_dequeue_locked(bp, cw);
+	} else if (cw->cw_state == CW_DO_CONNECT)
+		bp->n_conn++;
 	Lck_Unlock(bp->director->mtx);
 
 	if (cw->cw_state == CW_BE_BUSY) {
@@ -284,11 +287,10 @@ vbe_dir_getfd(VRT_CTX, struct worker *wrk, VCL_BACKEND dir, struct backend *bp,
 	if (bo->htc == NULL) {
 		VSLb(bo->vsl, SLT_FetchError, "out of workspace");
 		/* XXX: counter ? */
-		if (cw->cw_state == CW_QUEUED) {
-			Lck_Lock(bp->director->mtx);
-			vbe_connwait_dequeue_locked(bp, cw);
-			Lck_Unlock(bp->director->mtx);
-		}
+		Lck_Lock(bp->director->mtx);
+		bp->n_conn--;
+		vbe_connwait_signal_locked(bp);
+		Lck_Unlock(bp->director->mtx);
 		vbe_connwait_fini(cw);
 		return (NULL);
 	}
@@ -300,17 +302,14 @@ vbe_dir_getfd(VRT_CTX, struct worker *wrk, VCL_BACKEND dir, struct backend *bp,
 	if (pfd == NULL) {
 		Lck_Lock(bp->director->mtx);
 		VBE_Connect_Error(bp->vsc, err);
+		bp->n_conn--;
+		vbe_connwait_signal_locked(bp);
 		Lck_Unlock(bp->director->mtx);
 		VSLb(bo->vsl, SLT_FetchError,
 		    "backend %s: fail errno %d (%s)",
 		    VRT_BACKEND_string(dir), err, VAS_errtxt(err));
 		VSC_C_main->backend_fail++;
 		bo->htc = NULL;
-		if (cw->cw_state == CW_QUEUED) {
-			Lck_Lock(bp->director->mtx);
-			vbe_connwait_dequeue_locked(bp, cw);
-			Lck_Unlock(bp->director->mtx);
-		}
 		vbe_connwait_fini(cw);
 		return (NULL);
 	}
@@ -321,12 +320,8 @@ vbe_dir_getfd(VRT_CTX, struct worker *wrk, VCL_BACKEND dir, struct backend *bp,
 	assert(*fdp >= 0);
 
 	Lck_Lock(bp->director->mtx);
-	bp->n_conn++;
 	bp->vsc->conn++;
 	bp->vsc->req++;
-	if (cw->cw_state == CW_QUEUED)
-		vbe_connwait_dequeue_locked(bp, cw);
-
 	Lck_Unlock(bp->director->mtx);
 
 	CHECK_OBJ_NOTNULL(bo->htc->doclose, STREAM_CLOSE_MAGIC);
46 changes: 46 additions & 0 deletions bin/varnishtest/tests/b00089.vtc
@@ -0,0 +1,46 @@
varnishtest "connect: blackhole"

# XXX: This test case relies on a way for us to never succeed in connecting
# to a backend. We need the connect syscall to hang forever. For TCP this means
# that the backend must not respond to the three-way handshake initiated by
# Varnish.
#
# One way to achieve this is to use an IP address that is not routable over
# the Internet, but this is not always a bullet-proof solution across all the
# platforms we support. Another downside of this solution is that it sends
# traffic over the local network.
#
# The current test case seems to work on a few supported platforms.
# Comment out the line below to enable this test.

feature cmd false

varnish v1 -vcl {
backend default {
# Non-routable IPv4 address to simulate connect_timeout
# XXX: $\{blackhole_backend}
.host = "10.255.255.1 80";
.max_connections = 1;
}
} -start

varnish v1 -cli "param.set connect_timeout 2"

client c1 {
txreq -url "/blackhole"
rxresp
expect resp.status == 503
} -start

client c2 {
txreq -url "/bye"
rxresp
expect resp.status == 503
} -start

client c1 -wait
client c2 -wait

varnish v1 -expect MAIN.backend_conn == 0
varnish v1 -expect MAIN.backend_busy == 1
varnish v1 -expect MAIN.backend_fail == 1
49 changes: 49 additions & 0 deletions bin/varnishtest/tests/b00090.vtc
@@ -0,0 +1,49 @@
varnishtest "connect: blackhole with queueing"

# XXX: This test case relies on a way for us to never succeed in connecting
# to a backend. We need the connect syscall to hang forever. For TCP this means
# that the backend must not respond to the three-way handshake initiated by
# Varnish.
#
# One way to achieve this is to use an IP address that is not routable over
# the Internet, but this is not always a bullet-proof solution across all the
# platforms we support. Another downside of this solution is that it sends
# traffic over the local network.
#
# The current test case seems to work on a few supported platforms.
# Comment out the line below to enable this test.

feature cmd false

varnish v1 -vcl {
backend default {
# Non-routable IPv4 address to simulate connect_timeout
# XXX: $\{blackhole_backend}
.host = "10.255.255.1 80";
.max_connections = 1;
.wait_limit = 1;
.wait_timeout = 0.2s;
}
} -start

varnish v1 -cli "param.set connect_timeout 2"

client c1 {
txreq -url "/blackhole"
rxresp
expect resp.status == 503
} -start

client c2 {
txreq -url "/bye"
rxresp
expect resp.status == 503
} -start

client c1 -wait
client c2 -wait

varnish v1 -expect MAIN.backend_conn == 0
varnish v1 -expect MAIN.backend_busy == 1
varnish v1 -expect MAIN.backend_wait_fail == 1
varnish v1 -expect MAIN.backend_fail == 1
