Skip to content

Commit

Permalink
tools: generate tab level test summary instead of test level.
Browse files Browse the repository at this point in the history
Signed-off-by: Siyuan Zhang <sizhang@google.com>
  • Loading branch information
siyuanfoundation committed May 4, 2024
1 parent ce45881 commit edc5f16
Show file tree
Hide file tree
Showing 6 changed files with 284 additions and 68 deletions.
11 changes: 6 additions & 5 deletions scripts/measure-testgrid-flakiness.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@ fi

pushd ./tools/testgrid-analysis
# ci-etcd-e2e-amd64 and ci-etcd-unit-test-amd64 run 6 times a day. Keeping a rolling window of 14 days.
go run main.go flaky --create-issue --dashboard=sig-etcd-periodics --tab=ci-etcd-e2e-amd64 --max-days=14
go run main.go flaky --create-issue --dashboard=sig-etcd-periodics --tab=ci-etcd-unit-test-amd64 --max-days=14
go run main.go flaky --auto-create-issues --dashboard=sig-etcd-periodics --tab=ci-etcd-e2e-amd64 --max-days=14
go run main.go flaky --auto-create-issues --dashboard=sig-etcd-periodics --tab=ci-etcd-unit-test-amd64 --max-days=14

# presubmit tests: previously no issues were created for these; the updated commands below pass --auto-create-issues
go run main.go flaky --dashboard=sig-etcd-presubmits --tab=pull-etcd-e2e-amd64
go run main.go flaky --dashboard=sig-etcd-presubmits --tab=pull-etcd-unit-test
go run main.go flaky --auto-create-issues --dashboard=sig-etcd-presubmits --tab=pull-etcd-e2e-amd64 --max-days=14
go run main.go flaky --auto-create-issues --dashboard=sig-etcd-presubmits --tab=pull-etcd-unit-test --max-days=14

go run main.go auto-close-stale-issues --days-before-auto-close=14

popd
116 changes: 100 additions & 16 deletions tools/testgrid-analysis/cmd/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"io"
"net/http"
"os"
"sort"
"strings"
"time"

Expand All @@ -36,20 +37,41 @@ var (
skippedTestStatuses = make(map[int32]struct{})
)

// TabResultSummary aggregates the test results of one TestGrid dashboard tab.
type TabResultSummary struct {
	// DashboardName is the TestGrid dashboard the tab belongs to.
	DashboardName string
	// TabName is the name of the tab inside the dashboard.
	TabName string
	// TestsWithFailures lists the per-test summaries that recorded at least
	// one failed run.
	TestsWithFailures []*TestResultSummary
	// FailureRate is the fraction of builds of this tab that had a failure
	// (failed builds divided by all builds).
	FailureRate float32
	// IssueBody is the markdown report generated for the tab.
	IssueBody string

	// allBuilds and failedBuilds are sets of build identifiers merged from
	// the per-test summaries.
	allBuilds    map[string]struct{}
	failedBuilds map[string]struct{}
}

// TestResultSummary holds the aggregated results of a single test row.
type TestResultSummary struct {
	// Name is the display name of the test (presumably the shortened form of
	// FullName; the assignment is outside this view).
	Name string
	// FullName is the test name used in the generated reports.
	FullName string
	// TotalRuns is the number of runs that were counted for the test.
	TotalRuns int
	// FailedRuns is the number of counted runs that failed.
	FailedRuns int
	// FailureRate is the fraction of counted runs that failed.
	FailureRate float32
	// FailureLogs holds one markdown table row per failed run.
	FailureLogs []string
	// IssueBody is the markdown report generated for the test.
	IssueBody string

	// allBuilds and failedBuilds are sets of build identifiers in which the
	// test was counted and in which it failed, respectively.
	allBuilds    map[string]struct{}
	failedBuilds map[string]struct{}
}

func fetchTestResultSummaries(dashboard, tab string) []*TestResultSummary {
// Fetch test data
rowsURL := fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/rows", dashboard, tab)
headersURL := fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/headers", dashboard, tab)
// FetchTabResultSummary creates the summary for the given dashboard tab,
// runs the tab's result analysis on it and returns a pointer to the
// populated summary.
func FetchTabResultSummary(dashboard, tab string) *TabResultSummary {
	result := &TabResultSummary{
		DashboardName: dashboard,
		TabName:       tab,
	}
	result.analyzeTestResults()
	return result
}

// dataURLs returns the TestGrid data API endpoints for this tab: the URL of
// its rows and the URL of its headers, in that order.
func (tab *TabResultSummary) dataURLs() (rowsURL, headersURL string) {
	rows := fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/rows", tab.DashboardName, tab.TabName)
	headers := fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/headers", tab.DashboardName, tab.TabName)
	return rows, headers
}

func (tab *TabResultSummary) analyzeTestResults() {
// Fetch test data
rowsURL, headersURL := tab.dataURLs()
var testData apipb.ListRowsResponse
var headerData apipb.ListHeadersResponse
protojson.Unmarshal(fetchJSON(rowsURL), &testData)
Expand All @@ -60,13 +82,48 @@ func fetchTestResultSummaries(dashboard, tab string) []*TestResultSummary {
allTests = append(allTests, row.Name)
}

summaries := []*TestResultSummary{}
tab.allBuilds = map[string]struct{}{}
tab.failedBuilds = map[string]struct{}{}

// Process rows
for _, row := range testData.Rows {
t := processRow(dashboard, tab, row, allTests, headerData.Headers)
summaries = append(summaries, t)
t := processRow(tab.DashboardName, tab.TabName, row, allTests, headerData.Headers)
mergeMaps(t.allBuilds, tab.allBuilds)
mergeMaps(t.failedBuilds, tab.failedBuilds)
if t.FailedRuns > 0 {
tab.TestsWithFailures = append(tab.TestsWithFailures, t)
}
}
sort.Slice(tab.TestsWithFailures, func(i, j int) bool {
ti := tab.TestsWithFailures[i]
tj := tab.TestsWithFailures[j]
if ti.FailureRate == tj.FailureRate {
if ti.FailedRuns == tj.FailedRuns {
return ti.FullName < tj.FullName
}
return ti.FailedRuns > tj.FailedRuns
}
return ti.FailureRate > tj.FailureRate
})
if len(tab.allBuilds) > 0 {
tab.FailureRate = float32(len(tab.failedBuilds)) / float32(len(tab.allBuilds))
}
tab.IssueBody += fmt.Sprintf("%s#%s failed %.1f%% (%d/%d) of the time\n", tab.DashboardName, tab.TabName,
100*tab.FailureRate, len(tab.failedBuilds), len(tab.allBuilds))
if len(tab.failedBuilds) > 0 {
tab.IssueBody += "<details>\n<summary><b>Recent failed test logs</b></summary>\n"
for _, header := range headerData.Headers {
if _, found := tab.failedBuilds[header.Build]; found {
tab.IssueBody += fmt.Sprintf("\n* %s", buildLogURL(tab.TabName, header))
}
}
tab.IssueBody += "\n</details>\n<details>\n<summary><b>Failed tests</b></summary>\n"
for _, t := range tab.TestsWithFailures {
tab.IssueBody += fmt.Sprintf("\n* %s failed %.1f%% (%d/%d) of the time", t.FullName, t.FailureRate*100, t.FailedRuns, t.TotalRuns)
}
tab.IssueBody += "\n</details>\n"
}
return summaries
fmt.Println(tab.IssueBody)
}

func processRow(dashboard, tab string, row *apipb.ListRowsResponse_Row, allTests []string, headers []*apipb.ListHeadersResponse_Header) *TestResultSummary {
Expand All @@ -81,6 +138,8 @@ func processRow(dashboard, tab string, row *apipb.ListRowsResponse_Row, allTests
earliestTimeToConsider := time.Now().AddDate(0, 0, -1*maxDays)
total := 0
failed := 0
allBuilds := map[string]struct{}{}
failedBuilds := map[string]struct{}{}
logs := []string{}
for i, cell := range row.Cells {
// ignore tests with status not in the validTestStatuses
Expand All @@ -96,10 +155,12 @@ func processRow(dashboard, tab string, row *apipb.ListRowsResponse_Row, allTests
continue
}
total++
allBuilds[header.Build] = struct{}{}
if _, ok := failureTestStatusesInt[cell.Result]; ok {
failed++
failedBuilds[header.Build] = struct{}{}
// markdown table format of | commit | log |
logs = append(logs, fmt.Sprintf("| %s | %s | https://prow.k8s.io/view/gs/kubernetes-jenkins/logs/%s/%s |", strings.Join(header.Extra, ","), header.Started.AsTime().String(), tab, header.Build))
logs = append(logs, fmt.Sprintf("| %s | %s | %s |", strings.Join(header.Extra, ","), header.Started.AsTime().String(), buildLogURL(tab, header)))
}
if maxRuns > 0 && total >= maxRuns {
break
Expand All @@ -109,12 +170,12 @@ func processRow(dashboard, tab string, row *apipb.ListRowsResponse_Row, allTests
t.TotalRuns = total
t.FailureLogs = logs
t.FailureRate = float32(failed) / float32(total)
t.failedBuilds = failedBuilds
t.allBuilds = allBuilds
if t.FailedRuns > 0 {
dashboardUrl := fmt.Sprintf("[%s](https://testgrid.k8s.io/%s#%s)", tab, dashboard, tab)
t.IssueBody = fmt.Sprintf("## %s Test: %s \nTest failed %.1f%% (%d/%d) of the time\n\nfailure logs are:\n| commit | started | log |\n| --- | --- | --- |\n%s\n",
dashboardUrl, t.FullName, t.FailureRate*100, t.FailedRuns, t.TotalRuns, strings.Join(t.FailureLogs, "\n"))
t.IssueBody += "\nPlease follow the [instructions in the contributing guide](https://github.com/etcd-io/etcd/blob/main/CONTRIBUTING.md#check-for-flaky-tests) to reproduce the issue.\n"
fmt.Printf("%s failed %.1f%% (%d/%d) of the time\n", t.FullName, t.FailureRate*100, t.FailedRuns, t.TotalRuns)
t.IssueBody = fmt.Sprintf("## %s Test: %s \nTest failed %.1f%% (%d/%d) of the time\n\n<details>\n<summary><b>failure logs:</b></summary>\n\n| commit | started | log |\n| --- | --- | --- |\n%s\n",
dashboardTabURL(dashboard, tab), t.FullName, t.FailureRate*100, t.FailedRuns, t.TotalRuns, strings.Join(t.FailureLogs, "\n"))
t.IssueBody += "\n</details>\n\nPlease follow the [instructions in the contributing guide](https://github.com/etcd-io/etcd/blob/main/CONTRIBUTING.md#check-for-flaky-tests) to reproduce the issue.\n"
}
return &t
}
Expand Down Expand Up @@ -150,6 +211,29 @@ func intStatusSet(statuses []statuspb.TestStatus) map[int32]struct{} {
}

// shortenTestName strips the leading import path from a fully qualified test
// name and keeps the package name, the test name and any subtest path.
//
// The name is treated as "/"-separated segments. The result starts at the
// last segment that contains a "." (normally "<package>.<TestName>") and runs
// to the end of the name, for example
//
//	go.etcd.io/etcd/tests/v3/common.TestKVGet/ClientTLS -> common.TestKVGet/ClientTLS
//
// A name without any "." (including the empty string) is returned unchanged.
func shortenTestName(fullname string) string {
	// The last segment containing a "." is the one that holds the last "."
	// of the whole string.
	dot := strings.LastIndex(fullname, ".")
	if dot < 0 {
		return fullname
	}
	// That segment begins right after the closest "/" preceding the dot; when
	// there is none, LastIndex yields -1 and the whole name is kept.
	slash := strings.LastIndex(fullname[:dot], "/")
	return fullname[slash+1:]
}

// mergeMaps adds every key of the set from to the set to. The destination
// map is modified in place; from is left untouched.
func mergeMaps(from, to map[string]struct{}) {
	for key := range from {
		to[key] = struct{}{}
	}
}

// dashboardTabURL returns a markdown link whose text is the tab name and
// whose target is the tab's page on testgrid.k8s.io.
func dashboardTabURL(dashboard, tab string) string {
	const linkFormat = "[%s](https://testgrid.k8s.io/%s#%s)"
	return fmt.Sprintf(linkFormat, tab, dashboard, tab)
}

// buildLogURL returns the prow.k8s.io log page URL for the build described
// by header, within the job named tab.
func buildLogURL(tab string, header *apipb.ListHeadersResponse_Header) string {
	const logURLFormat = "https://prow.k8s.io/view/gs/kubernetes-jenkins/logs/%s/%s"
	return fmt.Sprintf(logURLFormat, tab, header.Build)
}
51 changes: 51 additions & 0 deletions tools/testgrid-analysis/cmd/data_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Copyright 2024 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
"testing"
)

// TestShortenTestName verifies that shortenTestName drops the import path
// prefix while keeping the package name, test name and subtest path.
func TestShortenTestName(t *testing.T) {
	type testCase struct {
		testName  string
		shortName string
	}
	cases := []testCase{
		{"go.etcd.io/etcd/tests/v3/common.TestKVGet/ClientTLS", "common.TestKVGet/ClientTLS"},
		{"go.etcd.io/etcd/tests/v3/common.TestKVDelete/ClientTLS", "common.TestKVDelete/ClientTLS"},
		{"go.etcd.io/etcd/tests/v3/common.TestLeaseGrantAndList/ClientAutoTLS/many_leases", "common.TestLeaseGrantAndList/ClientAutoTLS/many_leases"},
		{"go.etcd.io/etcd/tests/v3/common.TestMoveLeaderWithInvalidAuth", "common.TestMoveLeaderWithInvalidAuth"},
	}
	for _, tc := range cases {
		input, want := tc.testName, tc.shortName
		t.Run(input, func(t *testing.T) {
			if got := shortenTestName(input); got != want {
				t.Errorf("Want %s, got %s", want, got)
			}
		})
	}
}
67 changes: 37 additions & 30 deletions tools/testgrid-analysis/cmd/flaky.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,52 +28,59 @@ var flakyCmd = &cobra.Command{
Run: flakyFunc,
}

// closeStaleIssuesCmd defines the "auto-close-stale-issues" subcommand; its
// Run handler is closeStaleIssuesFunc.
var closeStaleIssuesCmd = &cobra.Command{
Use: "auto-close-stale-issues",
Short: "auto close stale flaky test issues",
Long: `automatically close stale Github issues for flaky test.`,
Run: closeStaleIssuesFunc,
}

var (
flakyThreshold float32
minRuns int
maxRuns int
maxDays int
createGithubIssue bool
githubOwner string
githubRepo string
flakyThreshold float32
maxSubIssuesForTestSet int
minRuns int
maxRuns int
maxDays int
autoCreateIssues bool
daysBeforeAutoClose int

lineSep = "-------------------------------------------------------------"
)

func init() {
rootCmd.AddCommand(flakyCmd)
rootCmd.AddCommand(closeStaleIssuesCmd)

flakyCmd.Flags().BoolVar(&createGithubIssue, "create-issue", false, "create Github issue for each flaky test")
flakyCmd.Flags().Float32Var(&flakyThreshold, "flaky-threshold", 0.1, "fraction threshold of test failures for a test to be considered flaky")
flakyCmd.Flags().IntVar(&minRuns, "min-runs", 20, "minimum test runs for a test to be included in flaky analysis")
flakyCmd.Flags().IntVar(&minRuns, "min-runs", 20, "minimum test runs for a test to be created an issue for")
flakyCmd.Flags().IntVar(&maxRuns, "max-runs", 0, "maximum test runs for a test to be included in flaky analysis, 0 to include all")
flakyCmd.Flags().IntVar(&maxDays, "max-days", 0, "maximum days of results before today to be included in flaky analysis, 0 to include all")
flakyCmd.Flags().StringVar(&githubOwner, "github-owner", "etcd-io", "the github organization to create the issue for")
flakyCmd.Flags().StringVar(&githubRepo, "github-repo", "etcd", "the github repo to create the issue for")
flakyCmd.Flags().IntVar(&maxDays, "max-days", 30, "maximum days of results before today to be included in flaky analysis, 0 to include all")
flakyCmd.Flags().BoolVar(&autoCreateIssues, "auto-create-issues", false, "automatically create Github issue for flaky test")
flakyCmd.Flags().IntVar(&maxSubIssuesForTestSet, "max-sub-issues", 3, "maximum number of sub-issues to create for a test set")

closeStaleIssuesCmd.Flags().IntVar(&daysBeforeAutoClose, "days-before-auto-close", 30, "maximum days of no updates before an issue is automatically closed")
}

func flakyFunc(cmd *cobra.Command, args []string) {
fmt.Printf("flaky called, for %s#%s, createGithubIssue=%v, githubRepo=%s/%s, flakyThreshold=%f, minRuns=%d\n", dashboard, tab, createGithubIssue, githubOwner, githubRepo, flakyThreshold, minRuns)

allTests := fetchTestResultSummaries(dashboard, tab)
flakyTests := []*TestResultSummary{}
for _, t := range allTests {
if t.TotalRuns >= minRuns && t.FailureRate >= flakyThreshold {
flakyTests = append(flakyTests, t)
}
}
fmt.Println(lineSep)
fmt.Printf("Detected total %d flaky tests above the %.0f%% threshold for %s#%s\n", len(flakyTests), flakyThreshold*100, dashboard, tab)
fmt.Printf("flaky called, for %s#%s, createGithubIssue=%v, githubRepo=%s/%s, flakyThreshold=%f, minRuns=%d\n", dashboard, tab, autoCreateIssues, githubOwner, githubRepo, flakyThreshold, minRuns)

tabSummary := FetchTabResultSummary(dashboard, tab)
fmt.Println(lineSep)
if len(flakyTests) == 0 {
if tabSummary.FailureRate < flakyThreshold {
fmt.Printf("Failure rate for test set %s#%s is %.1f%%, below the flaky threshold %.0f%%\n", dashboard, tab, tabSummary.FailureRate*100, flakyThreshold*100)
return
}
for _, t := range flakyTests {
fmt.Println(lineSep)
fmt.Println(t.IssueBody)
fmt.Println(lineSep)
}
if createGithubIssue {
createIssues(flakyTests, []string{"type/flake"})
fmt.Printf("Failure rate for test set %s#%s is %.1f%%, above the flaky threshold %.0f%%\n", dashboard, tab, tabSummary.FailureRate*100, flakyThreshold*100)
if autoCreateIssues {
createIssues(tabSummary, minRuns, maxSubIssuesForTestSet, []string{"type/flake"})
}
fmt.Println(lineSep)
}

// closeStaleIssuesFunc is the Run handler of the auto-close-stale-issues
// command. It prints a banner describing the operation, asks closeStaleIssues
// to process issues labelled "type/flake" using the configured number of
// days, and prints a closing separator.
func closeStaleIssuesFunc(cmd *cobra.Command, args []string) {
	flakeLabels := []string{"type/flake"}

	fmt.Println(lineSep)
	fmt.Printf("auto close stale issues with no updates for %d days in githubRepo=%s/%s\n", daysBeforeAutoClose, githubOwner, githubRepo)

	closeStaleIssues(daysBeforeAutoClose, flakeLabels)

	fmt.Println(lineSep)
}
Loading

0 comments on commit edc5f16

Please sign in to comment.