Add functional test for product telemetry

pleshakov · pleshakov · commit 04b783b357c8 · 2024-03-08T18:06:05.000-05:00
Problem: Ensure the product telemetry feature is tested with a functional test Solution: - Add a functional test. - Because it requires an NGF image with a custom build, it only runs when ginkgo runs with its label. Testing: Ran successfully: - make test TAG=$(whoami) GINKGO_LABEL=telemetry PLUS_ENABLED=true - make test TAG=$(whoami) GINKGO_LABEL=telemetry - make test TAG=$(whoami) # here telemetry test was skipped, but all the rest ran successfully CLOSES - #1640
diff --git a/tests/Makefile b/tests/Makefile
@@ -13,6 +13,8 @@ GINKGO_LABEL=
 GINKGO_FLAGS=
 NGF_VERSION=
 CI=false
+TELEMETRY_ENDPOINT=
+TELEMETRY_ENDPOINT_INSECURE=
 
 ifneq ($(GINKGO_LABEL),)
     override GINKGO_FLAGS += -ginkgo.label-filter "$(GINKGO_LABEL)"
@@ -34,11 +36,11 @@ create-kind-cluster: ## Create a kind cluster
 
 .PHONY: build-images
 build-images: ## Build NGF and NGINX images
-	cd .. && make PREFIX=$(PREFIX) TAG=$(TAG) build-images
+	cd .. && make PREFIX=$(PREFIX) TAG=$(TAG) TELEMETRY_ENDPOINT=$(TELEMETRY_ENDPOINT) TELEMETRY_ENDPOINT_INSECURE=$(TELEMETRY_ENDPOINT_INSECURE) build-images
 
 .PHONY: build-images-with-plus
 build-images-with-plus: ## Build NGF and NGINX Plus images
-	cd .. && make PREFIX=$(PREFIX) TAG=$(TAG) build-images-with-plus
+	cd .. && make PREFIX=$(PREFIX) TAG=$(TAG) TELEMETRY_ENDPOINT=$(TELEMETRY_ENDPOINT) TELEMETRY_ENDPOINT_INSECURE=$(TELEMETRY_ENDPOINT_INSECURE) build-images-with-plus
 
 .PHONY: load-images
 load-images: ## Load NGF and NGINX images on configured kind cluster
@@ -48,6 +50,32 @@ load-images: ## Load NGF and NGINX images on configured kind cluster
 load-images-with-plus: ## Load NGF and NGINX Plus images on configured kind cluster
 	cd .. && make PREFIX=$(PREFIX) TAG=$(TAG) load-images-with-plus
 
+.PHONY: update-ngf-manifest
+update-ngf-manifest: ## Update the NGF deployment manifest image names and imagePullPolicies
+	cd .. \
+    && make generate-manifests HELM_TEMPLATE_COMMON_ARGS="\
+    --set nginxGateway.image.repository=$(PREFIX) \
+    --set nginxGateway.image.tag=$(TAG) \
+    --set nginxGateway.image.pullPolicy=Never \
+    --set nginx.image.repository=$(NGINX_PREFIX) \
+    --set nginx.image.tag=$(TAG) \
+    --set nginx.image.pullPolicy=Never" \
+    && cd -
+
+.PHONY: update-ngf-manifest-with-plus
+update-ngf-manifest-with-plus: ## Update the NGF deployment manifest image names and imagePullPolicies including nginx-plus
+	cd .. \
+    && make generate-manifests HELM_TEMPLATE_COMMON_ARGS="\
+    --set nginxGateway.image.repository=$(PREFIX) \
+    --set nginxGateway.image.tag=$(TAG) \
+    --set nginxGateway.image.pullPolicy=Never \
+    --set nginx.image.repository=$(NGINX_PLUS_PREFIX) \
+    --set nginx.image.tag=$(TAG) \
+    --set nginx.image.pullPolicy=Never \
+    --set nginx.plus=true" \
+    && cd -
+
+
 test: ## Run the system tests against your default k8s cluster
 	go test -v ./suite $(GINKGO_FLAGS) -args --gateway-api-version=$(GW_API_VERSION) \
 		--gateway-api-prev-version=$(GW_API_PREV_VERSION) --image-tag=$(TAG) --version-under-test=$(NGF_VERSION) \
diff --git a/tests/README.md b/tests/README.md
@@ -55,24 +55,29 @@ load-images                    Load NGF and NGINX images on configured kind clus
 run-tests-on-vm                Run the tests on a GCP VM
 setup-gcp-and-run-tests        Create and setup a GKE router and GCP VM for tests and run the tests
 test                           Run the system tests against your default k8s cluster
+update-ngf-manifest-with-plus  Update the NGF deployment manifest image names and imagePullPolicies including nginx-plus
+update-ngf-manifest            Update the NGF deployment manifest image names and imagePullPolicies
 ```
 
 **Note:** The following variables are configurable when running the below `make` commands:
 
-| Variable            | Default                         | Description                                                    |
-| ------------------- | ------------------------------- | -------------------------------------------------------------- |
-| TAG                 | edge                            | tag for the locally built NGF images                           |
-| PREFIX              | nginx-gateway-fabric            | prefix for the locally built NGF image                         |
-| NGINX_PREFIX        | nginx-gateway-fabric/nginx      | prefix for the locally built NGINX image                       |
-| NGINX_PLUS_PREFIX   | nginx-gateway-fabric/nginx-plus | prefix for the locally built NGINX Plus image                  |
-| PLUS_ENABLED        | false                           | Flag to indicate if NGINX Plus should be enabled               |
-| PULL_POLICY         | Never                           | NGF image pull policy                                          |
-| GW_API_VERSION      | 1.0.0                           | version of Gateway API resources to install                    |
-| K8S_VERSION         | latest                          | version of k8s that the tests are run on                       |
-| GW_SERVICE_TYPE     | NodePort                        | type of Service that should be created                         |
-| GW_SVC_GKE_INTERNAL | false                           | specifies if the LoadBalancer should be a GKE internal service |
-| GINKGO_LABEL        | ""                              | name of the ginkgo label that will filter the tests to run     |
-| GINKGO_FLAGS        | ""                              | other ginkgo flags to pass to the go test command              |
+| Variable                     | Default                         | Description                                                         |
+|------------------------------|---------------------------------|---------------------------------------------------------------------|
+| TAG                          | edge                            | tag for the locally built NGF images                                |
+| PREFIX                       | nginx-gateway-fabric            | prefix for the locally built NGF image                              |
+| NGINX_PREFIX                 | nginx-gateway-fabric/nginx      | prefix for the locally built NGINX image                            |
+| NGINX_PLUS_PREFIX            | nginx-gateway-fabric/nginx-plus | prefix for the locally built NGINX Plus image                       |
+| PLUS_ENABLED                 | false                           | Flag to indicate if NGINX Plus should be enabled                    |
+| PULL_POLICY                  | Never                           | NGF image pull policy                                               |
+| GW_API_VERSION               | 1.0.0                           | version of Gateway API resources to install                         |
+| K8S_VERSION                  | latest                          | version of k8s that the tests are run on                            |
+| GW_SERVICE_TYPE              | NodePort                        | type of Service that should be created                              |
+| GW_SVC_GKE_INTERNAL          | false                           | specifies if the LoadBalancer should be a GKE internal service      |
+| GINKGO_LABEL                 | ""                              | name of the ginkgo label that will filter the tests to run          |
+| GINKGO_FLAGS                 | ""                              | other ginkgo flags to pass to the go test command                   |
+| TELEMETRY_ENDPOINT           | Set in the main Makefile        | The endpoint to which telemetry reports are sent                    |
+| TELEMETRY_ENDPOINT_INSECURE  | Set in the main Makefile        | Controls whether TLS should be used when sending telemetry reports. |
+
 
 ## Step 1 - Create a Kubernetes cluster
 
@@ -126,7 +131,27 @@ Or, to build NGF with NGINX Plus enabled (NGINX Plus cert and key must exist in
 make build-images-with-plus load-images-with-plus TAG=$(whoami)
 ```
 
-## Step 3 - Run the tests
+For the telemetry test, which requires an OTel collector, build an image with the following variables set:
+
+```makefile
+TELEMETRY_ENDPOINT=otel-collector-opentelemetry-collector.collector.svc.cluster.local:4317 TELEMETRY_ENDPOINT_INSECURE=true
+```
+
+## Step 3 - Update Manifests for a Local Run
+
+For NGINX OSS:
+
+```makefile
+make update-ngf-manifest TAG=$(whoami)
+```
+
+For NGINX Plus:
+
+```makefile
+make update-ngf-manifest-with-plus TAG=$(whoami)
+```
+
+## Step 4 - Run the tests
 
 ### 3a - Run the tests locally
 
@@ -140,7 +165,7 @@ Or, to run the tests with NGINX Plus enabled:
 make test TAG=$(whoami) PLUS_ENABLED=true
 ```
 
-### 3b - Run the tests on a GKE cluster from a GCP VM
+### 4b - Run the tests on a GKE cluster from a GCP VM
 
 This step only applies if you would like to run the tests on a GKE cluster from a GCP based VM.
 
@@ -185,6 +210,14 @@ or to pass a specific flag, e.g. run a specific test, use the GINKGO_FLAGS varia
 make test TAG=$(whoami) GINKGO_FLAGS='-ginkgo.focus "writes the system info to a results file"'
 ```
 
+To run the telemetry test, which requires a specially built image (see above), run:
+
+```makefile
+make test TAG=$(whoami) GINKGO_LABEL=telemetry
+```
+
+Otherwise, the test will be skipped.
+
 If you are running the tests in GCP, add your required label/ flags to `scripts/var.env`.
 
 You can also modify the tests code for a similar outcome. To run a specific test, you can "focus" it by adding the `F`
diff --git a/tests/framework/resourcemanager.go b/tests/framework/resourcemanager.go
@@ -30,27 +30,32 @@ import (
 	"strings"
 	"time"
 
+	apps "k8s.io/api/apps/v1"
 	core "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/apimachinery/pkg/util/yaml"
+	"k8s.io/client-go/kubernetes"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	v1 "sigs.k8s.io/gateway-api/apis/v1"
 )
 
 // ResourceManager handles creating/updating/deleting Kubernetes resources.
 type ResourceManager struct {
-	K8sClient     client.Client
-	FS            embed.FS
-	TimeoutConfig TimeoutConfig
+	K8sClient      client.Client
+	ClientGoClient kubernetes.Interface // used when K8sClient is not enough (e.g. streaming Pod logs)
+	FS             embed.FS
+	TimeoutConfig  TimeoutConfig
 }
 
 // ClusterInfo holds the cluster metadata
 type ClusterInfo struct {
-	K8sVersion      string
+	K8sVersion string
+	// ID is the UID of the kube-system namespace.
+	ID              string
 	MemoryPerNode   string
 	GkeInstanceType string
 	GkeZone         string
@@ -406,9 +411,89 @@ func (rm *ResourceManager) GetClusterInfo() (ClusterInfo, error) {
 		ci.GkeZone = node.Labels["topology.kubernetes.io/zone"]
 	}
 
+	var ns core.Namespace
+	key := types.NamespacedName{Name: "kube-system"}
+
+	if err := rm.K8sClient.Get(ctx, key, &ns); err != nil {
+		return *ci, fmt.Errorf("error getting kube-system namespace: %w", err)
+	}
+
+	ci.ID = string(ns.UID)
+
 	return *ci, nil
 }
 
+// GetPodNames returns the names of all Pods in the specified namespace that match the given labels.
+func (rm *ResourceManager) GetPodNames(namespace string, labels client.MatchingLabels) ([]string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), rm.TimeoutConfig.GetTimeout) // bounds the List call
+	defer cancel()
+
+	var podList core.PodList
+	if err := rm.K8sClient.List(
+		ctx,
+		&podList,
+		client.InNamespace(namespace),
+		labels,
+	); err != nil {
+		return nil, fmt.Errorf("error getting list of Pods: %w", err)
+	}
+
+	names := make([]string, 0, len(podList.Items)) // empty but non-nil when no Pods match
+
+	for _, pod := range podList.Items {
+		names = append(names, pod.Name)
+	}
+
+	return names, nil
+}
+
+// GetPodLogs returns the logs from the specified Pod as one string, read fully into memory.
+func (rm *ResourceManager) GetPodLogs(namespace, name string, opts *core.PodLogOptions) (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), rm.TimeoutConfig.GetTimeout)
+	defer cancel()
+
+	req := rm.ClientGoClient.CoreV1().Pods(namespace).GetLogs(name, opts) // client-go clientset, not K8sClient
+
+	logs, err := req.Stream(ctx)
+	if err != nil {
+		return "", fmt.Errorf("error getting logs from Pod: %w", err)
+	}
+	defer logs.Close() // the Close error is not checked
+
+	buf := new(bytes.Buffer)
+	if _, err := buf.ReadFrom(logs); err != nil {
+		return "", fmt.Errorf("error reading logs from Pod: %w", err)
+	}
+
+	return buf.String(), nil
+}
+
+// GetNGFDeployment returns the NGF Deployment in the specified namespace with the given release name.
+func (rm *ResourceManager) GetNGFDeployment(namespace, releaseName string) (*apps.Deployment, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), rm.TimeoutConfig.GetTimeout)
+	defer cancel()
+
+	var deployments apps.DeploymentList
+
+	if err := rm.K8sClient.List(
+		ctx,
+		&deployments,
+		client.InNamespace(namespace),
+		client.MatchingLabels{
+			"app.kubernetes.io/instance": releaseName, // the release name is matched against this label
+		},
+	); err != nil {
+		return nil, fmt.Errorf("error getting list of Deployments: %w", err)
+	}
+
+	if len(deployments.Items) != 1 { // zero matches and multiple matches are both errors
+		return nil, fmt.Errorf("expected 1 NGF Deployment, got %d", len(deployments.Items))
+	}
+
+	deployment := deployments.Items[0] // copy, so the returned pointer does not alias the list's slice
+	return &deployment, nil
+}
+
 // GetReadyNGFPodNames returns the name(s) of the NGF Pod(s).
 func GetReadyNGFPodNames(
 	k8sClient client.Client,
diff --git a/tests/suite/manifests/telemetry/collector-values.yaml b/tests/suite/manifests/telemetry/collector-values.yaml
@@ -0,0 +1,31 @@
+mode: deployment
+replicaCount: 1
+config:
+  exporters:
+    debug:
+      verbosity: detailed
+    logging: {}
+  extensions:
+    health_check: {}
+    memory_ballast:
+      size_in_percentage: 40
+  processors:
+    batch: {}
+    memory_limiter:
+      check_interval: 5s
+      limit_percentage: 80
+      spike_limit_percentage: 25
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317
+  service:
+    extensions:
+    - health_check
+    pipelines:
+      traces:
+        exporters:
+        - debug
+        receivers:
+        - otlp
diff --git a/tests/suite/system_suite_test.go b/tests/suite/system_suite_test.go
@@ -21,6 +21,7 @@ import (
 	k8sRuntime "k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/client-go/kubernetes"
 	ctlr "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/log"
@@ -100,11 +101,15 @@ func setup(cfg setupConfig, extraInstallArgs ...string) {
 	k8sClient, err = client.New(k8sConfig, options)
 	Expect(err).ToNot(HaveOccurred())
 
+	clientGoClient, err := kubernetes.NewForConfig(k8sConfig)
+	Expect(err).ToNot(HaveOccurred())
+
 	timeoutConfig = framework.DefaultTimeoutConfig()
 	resourceManager = framework.ResourceManager{
-		K8sClient:     k8sClient,
-		FS:            manifests,
-		TimeoutConfig: timeoutConfig,
+		K8sClient:      k8sClient,
+		ClientGoClient: clientGoClient,
+		FS:             manifests,
+		TimeoutConfig:  timeoutConfig,
 	}
 
 	clusterInfo, err = resourceManager.GetClusterInfo()
@@ -197,22 +202,26 @@ func teardown() {
 	)).To(Succeed())
 }
 
-var _ = BeforeSuite(func() {
+func getDefaultSetupCfg() setupConfig {
 	_, file, _, _ := runtime.Caller(0)
 	fileDir := path.Join(path.Dir(file), "../")
 	basepath := filepath.Dir(fileDir)
 	localChartPath = filepath.Join(basepath, "deploy/helm-chart")
 
-	cfg := setupConfig{
+	return setupConfig{
 		chartPath:    localChartPath,
 		gwAPIVersion: *gatewayAPIVersion,
 		deploy:       true,
 	}
+}
+
+var _ = BeforeSuite(func() {
+	cfg := getDefaultSetupCfg()
 
 	// If we are running the upgrade test only, then skip the initial deployment.
 	// The upgrade test will deploy its own version of NGF.
 	suiteConfig, _ := GinkgoConfiguration()
-	if suiteConfig.LabelFilter == "upgrade" {
+	if suiteConfig.LabelFilter == "upgrade" || suiteConfig.LabelFilter == "telemetry" {
 		cfg.deploy = false
 	}
 
diff --git a/tests/suite/telemetry_test.go b/tests/suite/telemetry_test.go