From 39b3b9dbb2df96ca1df6b6ae86bdbc63fc96f39e Mon Sep 17 00:00:00 2001
From: kegsay <kegan@matrix.org>
Date: Wed, 18 Oct 2023 14:23:40 +0100
Subject: [PATCH] Add COMPLEMENT_ENABLE_DIRTY_RUNS to vastly speed up test runs
 (#677)

* WIP: enable dirty runs

* Env var up dirty runs, fix up destroy function

* Play nicely with post-test scripts and always printing server logs

* Re-add in bad rebase
---
 .github/workflows/ci.yaml                     |  4 +--
 ENVIRONMENT.md                                |  9 +++--
 internal/config/config.go                     | 27 +++++++++++++--
 internal/docker/deployer.go                   |  6 ++++
 internal/docker/deployment.go                 | 18 ++++++++++
 test_package.go                               | 34 +++++++++++++++++--
 .../user_directory_display_names_test.go      |  4 ++-
 7 files changed, 92 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 238a1617..ee176302 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -43,13 +43,13 @@ jobs:
           - homeserver: Synapse
             tags: synapse_blacklist
             packages: ./tests/msc3874 ./tests/msc3902
-            env: "COMPLEMENT_SHARE_ENV_PREFIX=PASS_ PASS_SYNAPSE_COMPLEMENT_DATABASE=sqlite"
+            env: "COMPLEMENT_ENABLE_DIRTY_RUNS=1 COMPLEMENT_SHARE_ENV_PREFIX=PASS_ PASS_SYNAPSE_COMPLEMENT_DATABASE=sqlite"
             timeout: 20m
 
           - homeserver: Dendrite
             tags: dendrite_blacklist
             packages: ""
-            env: ""
+            env: "COMPLEMENT_ENABLE_DIRTY_RUNS=1"
             timeout: 10m
 
     steps:
diff --git a/ENVIRONMENT.md b/ENVIRONMENT.md
index a07c0bff..0dad97ed 100644
--- a/ENVIRONMENT.md
+++ b/ENVIRONMENT.md
@@ -4,7 +4,7 @@
 Complement is configured exclusively through the use of environment variables. These variables are described below.
 
 #### `COMPLEMENT_ALWAYS_PRINT_SERVER_LOGS`
-If 1, always prints the Homeserver container logs even on success.  
+If 1, always prints the Homeserver container logs even on success. When used with COMPLEMENT_ENABLE_DIRTY_RUNS, server logs are only printed once for reused deployments, at the very end of the test suite.  
 - Type: `bool`
 - Default: 0
 
@@ -21,6 +21,11 @@ If 1, prints out more verbose logging such as HTTP request/response bodies.
 - Type: `bool`
 - Default: 0
 
+#### `COMPLEMENT_ENABLE_DIRTY_RUNS`
+If 1, eligible tests will be provided with reusable deployments rather than a clean deployment. Eligible tests are tests run with `Deploy(t, numHomeservers)`. If enabled, COMPLEMENT_ALWAYS_PRINT_SERVER_LOGS and COMPLEMENT_POST_TEST_SCRIPT are run exactly once, at the end of all tests in the package. The post test script is run with the test name "COMPLEMENT_ENABLE_DIRTY_RUNS", and failed=false.  Enabling dirty runs can greatly speed up tests, at the cost of clear server logs and the chance of tests polluting each other. Tests using `OldDeploy` and blueprints will still have a fresh image for each test. Fresh images can still be desirable e.g user directory tests need a clean homeserver else search results can be polluted, tests which can blacklist a server over federation also need isolated deployments to stop failures impacting other tests. For these reasons, there will always be a way for a test to override this setting and get a dedicated deployment.  Eventually, dirty runs will become the default running mode of Complement, with an environment variable to disable this behaviour being added later, once this has stablised.  
+- Type: `bool`
+- Default: 0
+
 #### `COMPLEMENT_HOSTNAME_RUNNING_COMPLEMENT`
 The hostname of Complement from the perspective of a Homeserver running inside a container. This can be useful for container runtimes using another hostname to access the host from a container, like Podman that uses `host.containers.internal` instead.  
 - Type: `string`
@@ -35,7 +40,7 @@ A list of space separated blueprint names to not clean up after running. For exa
 - Type: `[]string`
 
 #### `COMPLEMENT_POST_TEST_SCRIPT`
-An arbitrary script to execute after a test was executed and before the container is removed. This can be used to extract, for example, server logs or database files. The script is passed the parameters: ContainerID, TestName, TestFailed (true/false)  
+An arbitrary script to execute after a test was executed and before the container is removed. This can be used to extract, for example, server logs or database files. The script is passed the parameters: ContainerID, TestName, TestFailed (true/false). When combined with COMPLEMENT_ENABLE_DIRTY_RUNS, the script is called exactly once at the end of the test suite, and is called with the TestName of "COMPLEMENT_ENABLE_DIRTY_RUNS" and TestFailed=false.  
 - Type: `string`
 - Default: ""
 
diff --git a/internal/config/config.go b/internal/config/config.go
index 033d964e..04b0f821 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -36,7 +36,9 @@ type Complement struct {
 	DebugLoggingEnabled bool
 	// Name: COMPLEMENT_ALWAYS_PRINT_SERVER_LOGS
 	// Default: 0
-	// Description: If 1, always prints the Homeserver container logs even on success.
+	// Description: If 1, always prints the Homeserver container logs even on success. When used with
+	// COMPLEMENT_ENABLE_DIRTY_RUNS, server logs are only printed once for reused deployments, at the very
+	// end of the test suite.
 	AlwaysPrintServerLogs bool
 	// Name: COMPLEMENT_SHARE_ENV_PREFIX
 	// Description: If set, all environment variables on the host with this prefix will be shared with
@@ -86,13 +88,33 @@ type Complement struct {
 	// like Podman that uses `host.containers.internal` instead.
 	HostnameRunningComplement string
 
+	// Name: COMPLEMENT_ENABLE_DIRTY_RUNS
+	// Default: 0
+	// Description: If 1, eligible tests will be provided with reusable deployments rather than a clean deployment.
+	// Eligible tests are tests run with `Deploy(t, numHomeservers)`. If enabled, COMPLEMENT_ALWAYS_PRINT_SERVER_LOGS
+	// and COMPLEMENT_POST_TEST_SCRIPT are run exactly once, at the end of all tests in the package. The post test script
+	// is run with the test name "COMPLEMENT_ENABLE_DIRTY_RUNS", and failed=false.
+	//
+	// Enabling dirty runs can greatly speed up tests, at the cost of clear server logs and the chance of tests
+	// polluting each other. Tests using `OldDeploy` and blueprints will still have a fresh image for each test.
+	// Fresh images can still be desirable e.g user directory tests need a clean homeserver else search results can
+	// be polluted, tests which can blacklist a server over federation also need isolated deployments to stop failures
+	// impacting other tests. For these reasons, there will always be a way for a test to override this setting and
+	// get a dedicated deployment.
+	//
+	// Eventually, dirty runs will become the default running mode of Complement, with an environment variable to
+	// disable this behaviour being added later, once this has stablised.
+	EnableDirtyRuns bool
+
 	HSPortBindingIP string
 
 	// Name: COMPLEMENT_POST_TEST_SCRIPT
 	// Default: ""
 	// Description: An arbitrary script to execute after a test was executed and before the container is removed.
 	// This can be used to extract, for example, server logs or database files. The script is passed the parameters:
-	// ContainerID, TestName, TestFailed (true/false)
+	// ContainerID, TestName, TestFailed (true/false). When combined with COMPLEMENT_ENABLE_DIRTY_RUNS, the script is
+	// called exactly once at the end of the test suite, and is called with the TestName of "COMPLEMENT_ENABLE_DIRTY_RUNS"
+	// and TestFailed=false.
 	PostTestScript string
 }
 
@@ -106,6 +128,7 @@ func NewConfigFromEnvVars(pkgNamespace, baseImageURI string) *Complement {
 	}
 	cfg.DebugLoggingEnabled = os.Getenv("COMPLEMENT_DEBUG") == "1"
 	cfg.AlwaysPrintServerLogs = os.Getenv("COMPLEMENT_ALWAYS_PRINT_SERVER_LOGS") == "1"
+	cfg.EnableDirtyRuns = os.Getenv("COMPLEMENT_ENABLE_DIRTY_RUNS") == "1"
 	cfg.EnvVarsPropagatePrefix = os.Getenv("COMPLEMENT_SHARE_ENV_PREFIX")
 	cfg.PostTestScript = os.Getenv("COMPLEMENT_POST_TEST_SCRIPT")
 	cfg.SpawnHSTimeout = time.Duration(parseEnvWithDefault("COMPLEMENT_SPAWN_HS_TIMEOUT_SECS", 30)) * time.Second
diff --git a/internal/docker/deployer.go b/internal/docker/deployer.go
index 3df382f3..6af22d74 100644
--- a/internal/docker/deployer.go
+++ b/internal/docker/deployer.go
@@ -148,6 +148,12 @@ func (d *Deployer) Deploy(ctx context.Context, blueprintName string) (*Deploymen
 	return dep, lastErr
 }
 
+func (d *Deployer) PrintLogs(dep *Deployment) {
+	for _, hsDep := range dep.HS {
+		printLogs(d.Docker, hsDep.ContainerID, hsDep.ContainerID)
+	}
+}
+
 // Destroy a deployment. This will kill all running containers.
 func (d *Deployer) Destroy(dep *Deployment, printServerLogs bool, testName string, failed bool) {
 	for _, hsDep := range dep.HS {
diff --git a/internal/docker/deployment.go b/internal/docker/deployment.go
index d27b9b4a..3847d61f 100644
--- a/internal/docker/deployment.go
+++ b/internal/docker/deployment.go
@@ -21,6 +21,8 @@ type Deployment struct {
 	Deployer *Deployer
 	// The name of the deployed blueprint
 	BlueprintName string
+	// Set to true if this deployment is a dirty deployment and so should not be destroyed.
+	Dirty bool
 	// A map of HS name to a HomeserverDeployment
 	HS               map[string]*HomeserverDeployment
 	Config           *config.Complement
@@ -52,10 +54,26 @@ func (hsDep *HomeserverDeployment) SetEndpoints(baseURL string, fedBaseURL strin
 	}
 }
 
+// DestroyAtCleanup destroys the entire deployment. It should be called at cleanup time for dirty
+// deployments only. Handles configuration options for things which should run at container destroy
+// time, like post-run scripts and printing logs.
+func (d *Deployment) DestroyAtCleanup() {
+	if !d.Dirty {
+		return
+	}
+	d.Deployer.Destroy(d, d.Deployer.config.AlwaysPrintServerLogs, "COMPLEMENT_ENABLE_DIRTY_RUNS", false)
+}
+
 // Destroy the entire deployment. Destroys all running containers. If `printServerLogs` is true,
 // will print container logs before killing the container.
 func (d *Deployment) Destroy(t *testing.T) {
 	t.Helper()
+	if d.Dirty {
+		if t.Failed() {
+			d.Deployer.PrintLogs(d)
+		}
+		return
+	}
 	d.Deployer.Destroy(d, d.Deployer.config.AlwaysPrintServerLogs || t.Failed(), t.Name(), t.Failed())
 }
 
diff --git a/test_package.go b/test_package.go
index 73ea52e3..0003120c 100644
--- a/test_package.go
+++ b/test_package.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"log"
 	"net/http"
+	"sync"
 	"sync/atomic"
 	"testing"
 	"time"
@@ -49,6 +50,11 @@ type TestPackage struct {
 	complementBuilder *docker.Builder
 	// a counter to stop tests from allocating the same container name
 	namespaceCounter uint64
+
+	// pointers to existing deployments for Deploy(t, 1) style deployments which are reused when run
+	// in dirty mode.
+	existingDeployments   map[int]*docker.Deployment
+	existingDeploymentsMu *sync.Mutex
 }
 
 // NewTestPackage creates a new test package which can be used to deploy containers for all tests
@@ -69,13 +75,21 @@ func NewTestPackage(pkgNamespace string) (*TestPackage, error) {
 	logrus.SetLevel(logrus.ErrorLevel)
 
 	return &TestPackage{
-		complementBuilder: builder,
-		namespaceCounter:  0,
-		Config:            cfg,
+		complementBuilder:     builder,
+		namespaceCounter:      0,
+		Config:                cfg,
+		existingDeployments:   make(map[int]*docker.Deployment),
+		existingDeploymentsMu: &sync.Mutex{},
 	}, nil
 }
 
 func (tp *TestPackage) Cleanup() {
+	// any dirty deployments need logs printed and post scripts run
+	tp.existingDeploymentsMu.Lock()
+	for _, dep := range tp.existingDeployments {
+		dep.DestroyAtCleanup()
+	}
+	tp.existingDeploymentsMu.Unlock()
 	tp.complementBuilder.Cleanup()
 }
 
@@ -105,6 +119,14 @@ func (tp *TestPackage) OldDeploy(t *testing.T, blueprint b.Blueprint) Deployment
 
 func (tp *TestPackage) Deploy(t *testing.T, numServers int) Deployment {
 	t.Helper()
+	if tp.Config.EnableDirtyRuns {
+		tp.existingDeploymentsMu.Lock()
+		existingDep := tp.existingDeployments[numServers]
+		tp.existingDeploymentsMu.Unlock()
+		if existingDep != nil {
+			return existingDep
+		}
+	}
 	blueprint := mapServersToBlueprint(numServers)
 	timeStartBlueprint := time.Now()
 	if err := tp.complementBuilder.ConstructBlueprintIfNotExist(blueprint); err != nil {
@@ -121,6 +143,12 @@ func (tp *TestPackage) Deploy(t *testing.T, numServers int) Deployment {
 		t.Fatalf("Deploy: Deploy returned error %s", err)
 	}
 	t.Logf("Deploy times: %v blueprints, %v containers", timeStartDeploy.Sub(timeStartBlueprint), time.Since(timeStartDeploy))
+	if tp.Config.EnableDirtyRuns {
+		dep.Dirty = true // stop this deployment being destroyed.
+		tp.existingDeploymentsMu.Lock()
+		tp.existingDeployments[numServers] = dep
+		tp.existingDeploymentsMu.Unlock()
+	}
 	return dep
 }
 
diff --git a/tests/csapi/user_directory_display_names_test.go b/tests/csapi/user_directory_display_names_test.go
index ce358d9d..60da9819 100644
--- a/tests/csapi/user_directory_display_names_test.go
+++ b/tests/csapi/user_directory_display_names_test.go
@@ -9,6 +9,7 @@ import (
 	"testing"
 
 	"github.com/matrix-org/complement"
+	"github.com/matrix-org/complement/b"
 	"github.com/matrix-org/complement/client"
 	"github.com/matrix-org/complement/helpers"
 	"github.com/matrix-org/complement/match"
@@ -41,7 +42,8 @@ func setupUsers(t *testing.T) (*client.CSAPI, *client.CSAPI, *client.CSAPI, func
 	// - Eve knows about Alice,
 	// - Alice reveals a private name to another friend Bob
 	// - Eve shouldn't be able to see that private name via the directory.
-	deployment := complement.Deploy(t, 1)
+	// Use a clean deployment for these tests so the user directory isn't polluted.
+	deployment := complement.OldDeploy(t, b.BlueprintCleanHS)
 	cleanup := func(t *testing.T) {
 		deployment.Destroy(t)
 	}