tools: generate tab level test summary instead of test level.

Signed-off-by: Siyuan Zhang <sizhang@google.com>
etcd-io · May 4, 2024 · edc5f16 · edc5f16
1 parent ce45881
commit edc5f16
Show file tree

Hide file tree

Showing 6 changed files with 284 additions and 68 deletions.
diff --git a/scripts/measure-testgrid-flakiness.sh b/scripts/measure-testgrid-flakiness.sh
@@ -11,11 +11,12 @@ fi
 
 pushd ./tools/testgrid-analysis
 # ci-etcd-e2e-amd64 and ci-etcd-unit-test-amd64 run 6 times a day. Keeping a rolling window of 14 days.
-go run main.go flaky --create-issue --dashboard=sig-etcd-periodics --tab=ci-etcd-e2e-amd64 --max-days=14
-go run main.go flaky --create-issue --dashboard=sig-etcd-periodics --tab=ci-etcd-unit-test-amd64 --max-days=14
+go run main.go flaky --auto-create-issues --dashboard=sig-etcd-periodics --tab=ci-etcd-e2e-amd64 --max-days=14
+go run main.go flaky --auto-create-issues --dashboard=sig-etcd-periodics --tab=ci-etcd-unit-test-amd64 --max-days=14
 
-# do not create issues for presubmit tests
-go run main.go flaky --dashboard=sig-etcd-presubmits --tab=pull-etcd-e2e-amd64
-go run main.go flaky --dashboard=sig-etcd-presubmits --tab=pull-etcd-unit-test
+go run main.go flaky --auto-create-issues --dashboard=sig-etcd-presubmits --tab=pull-etcd-e2e-amd64 --max-days=14
+go run main.go flaky --auto-create-issues --dashboard=sig-etcd-presubmits --tab=pull-etcd-unit-test --max-days=14
+
+go run main.go auto-close-stale-issues --days-before-auto-close=14
 
 popd
diff --git a/tools/testgrid-analysis/cmd/data.go b/tools/testgrid-analysis/cmd/data.go
@@ -19,6 +19,7 @@ import (
 	"io"
 	"net/http"
 	"os"
+	"sort"
 	"strings"
 	"time"
 
@@ -36,20 +37,41 @@ var (
 	skippedTestStatuses = make(map[int32]struct{})
 )
 
+type TabResultSummary struct {
+	DashboardName, TabName string
+	TestsWithFailures      []*TestResultSummary
+	FailureRate            float32
+	IssueBody              string
+	allBuilds              map[string]struct{}
+	failedBuilds           map[string]struct{}
+}
+
 type TestResultSummary struct {
 	Name                  string
 	FullName              string
 	TotalRuns, FailedRuns int
 	FailureRate           float32
 	FailureLogs           []string
 	IssueBody             string
+	allBuilds             map[string]struct{}
+	failedBuilds          map[string]struct{}
 }
 
-func fetchTestResultSummaries(dashboard, tab string) []*TestResultSummary {
-	// Fetch test data
-	rowsURL := fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/rows", dashboard, tab)
-	headersURL := fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/headers", dashboard, tab)
+func FetchTabResultSummary(dashboard, tab string) *TabResultSummary {
+	summary := TabResultSummary{DashboardName: dashboard, TabName: tab}
+	summary.analyzeTestResults()
+	return &summary
+}
 
+func (tab *TabResultSummary) dataURLs() (rowsURL, headersURL string) {
+	rowsURL = fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/rows", tab.DashboardName, tab.TabName)
+	headersURL = fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/headers", tab.DashboardName, tab.TabName)
+	return
+}
+
+func (tab *TabResultSummary) analyzeTestResults() {
+	// Fetch test data
+	rowsURL, headersURL := tab.dataURLs()
 	var testData apipb.ListRowsResponse
 	var headerData apipb.ListHeadersResponse
 	protojson.Unmarshal(fetchJSON(rowsURL), &testData)
@@ -60,13 +82,48 @@ func fetchTestResultSummaries(dashboard, tab string) []*TestResultSummary {
 		allTests = append(allTests, row.Name)
 	}
 
-	summaries := []*TestResultSummary{}
+	tab.allBuilds = map[string]struct{}{}
+	tab.failedBuilds = map[string]struct{}{}
+
 	// Process rows
 	for _, row := range testData.Rows {
-		t := processRow(dashboard, tab, row, allTests, headerData.Headers)
-		summaries = append(summaries, t)
+		t := processRow(tab.DashboardName, tab.TabName, row, allTests, headerData.Headers)
+		mergeMaps(t.allBuilds, tab.allBuilds)
+		mergeMaps(t.failedBuilds, tab.failedBuilds)
+		if t.FailedRuns > 0 {
+			tab.TestsWithFailures = append(tab.TestsWithFailures, t)
+		}
+	}
+	sort.Slice(tab.TestsWithFailures, func(i, j int) bool {
+		ti := tab.TestsWithFailures[i]
+		tj := tab.TestsWithFailures[j]
+		if ti.FailureRate == tj.FailureRate {
+			if ti.FailedRuns == tj.FailedRuns {
+				return ti.FullName < tj.FullName
+			}
+			return ti.FailedRuns > tj.FailedRuns
+		}
+		return ti.FailureRate > tj.FailureRate
+	})
+	if len(tab.allBuilds) > 0 {
+		tab.FailureRate = float32(len(tab.failedBuilds)) / float32(len(tab.allBuilds))
+	}
+	tab.IssueBody += fmt.Sprintf("%s#%s failed %.1f%% (%d/%d) of the time\n", tab.DashboardName, tab.TabName,
+		100*tab.FailureRate, len(tab.failedBuilds), len(tab.allBuilds))
+	if len(tab.failedBuilds) > 0 {
+		tab.IssueBody += "<details>\n<summary><b>Recent failed test logs</b></summary>\n"
+		for _, header := range headerData.Headers {
+			if _, found := tab.failedBuilds[header.Build]; found {
+				tab.IssueBody += fmt.Sprintf("\n* %s", buildLogURL(tab.TabName, header))
+			}
+		}
+		tab.IssueBody += "\n</details>\n<details>\n<summary><b>Failed tests</b></summary>\n"
+		for _, t := range tab.TestsWithFailures {
+			tab.IssueBody += fmt.Sprintf("\n* %s failed %.1f%% (%d/%d) of the time", t.FullName, t.FailureRate*100, t.FailedRuns, t.TotalRuns)
+		}
+		tab.IssueBody += "\n</details>\n"
 	}
-	return summaries
+	fmt.Println(tab.IssueBody)
 }
 
 func processRow(dashboard, tab string, row *apipb.ListRowsResponse_Row, allTests []string, headers []*apipb.ListHeadersResponse_Header) *TestResultSummary {
@@ -81,6 +138,8 @@ func processRow(dashboard, tab string, row *apipb.ListRowsResponse_Row, allTests
 	earliestTimeToConsider := time.Now().AddDate(0, 0, -1*maxDays)
 	total := 0
 	failed := 0
+	allBuilds := map[string]struct{}{}
+	failedBuilds := map[string]struct{}{}
 	logs := []string{}
 	for i, cell := range row.Cells {
 		// ignore tests with status not in the validTestStatuses
@@ -96,10 +155,12 @@ func processRow(dashboard, tab string, row *apipb.ListRowsResponse_Row, allTests
 			continue
 		}
 		total++
+		allBuilds[header.Build] = struct{}{}
 		if _, ok := failureTestStatusesInt[cell.Result]; ok {
 			failed++
+			failedBuilds[header.Build] = struct{}{}
 			// markdown table format of | commit | started | log |
-			logs = append(logs, fmt.Sprintf("| %s | %s | https://prow.k8s.io/view/gs/kubernetes-jenkins/logs/%s/%s |", strings.Join(header.Extra, ","), header.Started.AsTime().String(), tab, header.Build))
+			logs = append(logs, fmt.Sprintf("| %s | %s | %s |", strings.Join(header.Extra, ","), header.Started.AsTime().String(), buildLogURL(tab, header)))
 		}
 		if maxRuns > 0 && total >= maxRuns {
 			break
@@ -109,12 +170,12 @@ func processRow(dashboard, tab string, row *apipb.ListRowsResponse_Row, allTests
 	t.TotalRuns = total
 	t.FailureLogs = logs
 	t.FailureRate = float32(failed) / float32(total)
+	t.failedBuilds = failedBuilds
+	t.allBuilds = allBuilds
 	if t.FailedRuns > 0 {
-		dashboardUrl := fmt.Sprintf("[%s](https://testgrid.k8s.io/%s#%s)", tab, dashboard, tab)
-		t.IssueBody = fmt.Sprintf("## %s Test: %s \nTest failed %.1f%% (%d/%d) of the time\n\nfailure logs are:\n| commit | started | log |\n| --- | --- | --- |\n%s\n",
-			dashboardUrl, t.FullName, t.FailureRate*100, t.FailedRuns, t.TotalRuns, strings.Join(t.FailureLogs, "\n"))
-		t.IssueBody += "\nPlease follow the [instructions in the contributing guide](https://github.com/etcd-io/etcd/blob/main/CONTRIBUTING.md#check-for-flaky-tests) to reproduce the issue.\n"
-		fmt.Printf("%s failed %.1f%% (%d/%d) of the time\n", t.FullName, t.FailureRate*100, t.FailedRuns, t.TotalRuns)
+		t.IssueBody = fmt.Sprintf("## %s Test: %s \nTest failed %.1f%% (%d/%d) of the time\n\n<details>\n<summary><b>failure logs:</b></summary>\n\n| commit | started | log |\n| --- | --- | --- |\n%s\n",
+			dashboardTabURL(dashboard, tab), t.FullName, t.FailureRate*100, t.FailedRuns, t.TotalRuns, strings.Join(t.FailureLogs, "\n"))
+		t.IssueBody += "\n</details>\n\nPlease follow the [instructions in the contributing guide](https://github.com/etcd-io/etcd/blob/main/CONTRIBUTING.md#check-for-flaky-tests) to reproduce the issue.\n"
 	}
 	return &t
 }
@@ -150,6 +211,29 @@ func intStatusSet(statuses []statuspb.TestStatus) map[int32]struct{} {
 }
 
 func shortenTestName(fullname string) string {
-	parts := strings.Split(fullname, ".")
-	return parts[len(parts)-1]
+	parts := strings.Split(fullname, "/")
+	keepParts := []string{}
+	// walking from the end, keep every segment up to and including the first one that contains "." (the package-qualified test name).
+	for i := len(parts) - 1; i >= 0; i-- {
+		part := parts[i]
+		keepParts = append([]string{part}, keepParts...)
+		if strings.Contains(part, ".") {
+			break
+		}
+	}
+	return strings.Join(keepParts, "/")
+}
+
+func mergeMaps(from, to map[string]struct{}) {
+	for k, v := range from {
+		to[k] = v
+	}
+}
+
+func dashboardTabURL(dashboard, tab string) string {
+	return fmt.Sprintf("[%s](https://testgrid.k8s.io/%s#%s)", tab, dashboard, tab)
+}
+
+func buildLogURL(tab string, header *apipb.ListHeadersResponse_Header) string {
+	return fmt.Sprintf("https://prow.k8s.io/view/gs/kubernetes-jenkins/logs/%s/%s", tab, header.Build)
 }
diff --git a/tools/testgrid-analysis/cmd/data_test.go b/tools/testgrid-analysis/cmd/data_test.go
@@ -0,0 +1,51 @@
+// Copyright 2024 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"testing"
+)
+
+func TestShortenTestName(t *testing.T) {
+	tests := []struct {
+		testName  string
+		shortName string
+	}{
+		{
+			testName:  "go.etcd.io/etcd/tests/v3/common.TestKVGet/ClientTLS",
+			shortName: "common.TestKVGet/ClientTLS",
+		},
+		{
+			testName:  "go.etcd.io/etcd/tests/v3/common.TestKVDelete/ClientTLS",
+			shortName: "common.TestKVDelete/ClientTLS",
+		},
+		{
+			testName:  "go.etcd.io/etcd/tests/v3/common.TestLeaseGrantAndList/ClientAutoTLS/many_leases",
+			shortName: "common.TestLeaseGrantAndList/ClientAutoTLS/many_leases",
+		},
+		{
+			testName:  "go.etcd.io/etcd/tests/v3/common.TestMoveLeaderWithInvalidAuth",
+			shortName: "common.TestMoveLeaderWithInvalidAuth",
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.testName, func(t *testing.T) {
+			shortName := shortenTestName(tt.testName)
+			if shortName != tt.shortName {
+				t.Errorf("Want %s, got %s", tt.shortName, shortName)
+			}
+		})
+	}
+}
diff --git a/tools/testgrid-analysis/cmd/flaky.go b/tools/testgrid-analysis/cmd/flaky.go
@@ -28,52 +28,59 @@ var flakyCmd = &cobra.Command{
 	Run:   flakyFunc,
 }
 
+var closeStaleIssuesCmd = &cobra.Command{
+	Use:   "auto-close-stale-issues",
+	Short: "auto close stale flaky test issues",
+	Long:  `automatically close stale Github issues for flaky test.`,
+	Run:   closeStaleIssuesFunc,
+}
+
 var (
-	flakyThreshold    float32
-	minRuns           int
-	maxRuns           int
-	maxDays           int
-	createGithubIssue bool
-	githubOwner       string
-	githubRepo        string
+	flakyThreshold         float32
+	maxSubIssuesForTestSet int
+	minRuns                int
+	maxRuns                int
+	maxDays                int
+	autoCreateIssues       bool
+	daysBeforeAutoClose    int
 
 	lineSep = "-------------------------------------------------------------"
 )
 
 func init() {
 	rootCmd.AddCommand(flakyCmd)
+	rootCmd.AddCommand(closeStaleIssuesCmd)
 
-	flakyCmd.Flags().BoolVar(&createGithubIssue, "create-issue", false, "create Github issue for each flaky test")
 	flakyCmd.Flags().Float32Var(&flakyThreshold, "flaky-threshold", 0.1, "fraction threshold of test failures for a test to be considered flaky")
-	flakyCmd.Flags().IntVar(&minRuns, "min-runs", 20, "minimum test runs for a test to be included in flaky analysis")
+	flakyCmd.Flags().IntVar(&minRuns, "min-runs", 20, "minimum test runs for a test to be created an issue for")
 	flakyCmd.Flags().IntVar(&maxRuns, "max-runs", 0, "maximum test runs for a test to be included in flaky analysis, 0 to include all")
-	flakyCmd.Flags().IntVar(&maxDays, "max-days", 0, "maximum days of results before today to be included in flaky analysis, 0 to include all")
-	flakyCmd.Flags().StringVar(&githubOwner, "github-owner", "etcd-io", "the github organization to create the issue for")
-	flakyCmd.Flags().StringVar(&githubRepo, "github-repo", "etcd", "the github repo to create the issue for")
+	flakyCmd.Flags().IntVar(&maxDays, "max-days", 30, "maximum days of results before today to be included in flaky analysis, 0 to include all")
+	flakyCmd.Flags().BoolVar(&autoCreateIssues, "auto-create-issues", false, "automatically create Github issue for flaky test")
+	flakyCmd.Flags().IntVar(&maxSubIssuesForTestSet, "max-sub-issues", 3, "maximum number of sub-issues to create for a test set")
+
+	closeStaleIssuesCmd.Flags().IntVar(&daysBeforeAutoClose, "days-before-auto-close", 30, "maximum days of no updates before an issue is automatically closed")
 }
 
 func flakyFunc(cmd *cobra.Command, args []string) {
-	fmt.Printf("flaky called, for %s#%s, createGithubIssue=%v, githubRepo=%s/%s, flakyThreshold=%f, minRuns=%d\n", dashboard, tab, createGithubIssue, githubOwner, githubRepo, flakyThreshold, minRuns)
-
-	allTests := fetchTestResultSummaries(dashboard, tab)
-	flakyTests := []*TestResultSummary{}
-	for _, t := range allTests {
-		if t.TotalRuns >= minRuns && t.FailureRate >= flakyThreshold {
-			flakyTests = append(flakyTests, t)
-		}
-	}
 	fmt.Println(lineSep)
-	fmt.Printf("Detected total %d flaky tests above the %.0f%% threshold for %s#%s\n", len(flakyTests), flakyThreshold*100, dashboard, tab)
+	fmt.Printf("flaky called, for %s#%s, createGithubIssue=%v, githubRepo=%s/%s, flakyThreshold=%f, minRuns=%d\n", dashboard, tab, autoCreateIssues, githubOwner, githubRepo, flakyThreshold, minRuns)
+
+	tabSummary := FetchTabResultSummary(dashboard, tab)
 	fmt.Println(lineSep)
-	if len(flakyTests) == 0 {
+	if tabSummary.FailureRate < flakyThreshold {
+		fmt.Printf("Failure rate for test set %s#%s is %.1f%%, below the flaky threshold %.0f%%\n", dashboard, tab, tabSummary.FailureRate*100, flakyThreshold*100)
 		return
 	}
-	for _, t := range flakyTests {
-		fmt.Println(lineSep)
-		fmt.Println(t.IssueBody)
-		fmt.Println(lineSep)
-	}
-	if createGithubIssue {
-		createIssues(flakyTests, []string{"type/flake"})
+	fmt.Printf("Failure rate for test set %s#%s is %.1f%%, above the flaky threshold %.0f%%\n", dashboard, tab, tabSummary.FailureRate*100, flakyThreshold*100)
+	if autoCreateIssues {
+		createIssues(tabSummary, minRuns, maxSubIssuesForTestSet, []string{"type/flake"})
 	}
+	fmt.Println(lineSep)
+}
+
+func closeStaleIssuesFunc(cmd *cobra.Command, args []string) {
+	fmt.Println(lineSep)
+	fmt.Printf("auto close stale issues with no updates for %d days in githubRepo=%s/%s\n", daysBeforeAutoClose, githubOwner, githubRepo)
+	closeStaleIssues(daysBeforeAutoClose, []string{"type/flake"})
+	fmt.Println(lineSep)
 }