symflower · zimmski · May 16, 2024 · May 13, 2024 · May 16, 2024
diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go
@@ -46,6 +46,8 @@ type Evaluate struct {
 	Repositories []string `long:"repository" description:"Evaluate with this repository. By default all repositories are used."`
 	// ResultPath holds the directory path where results should be written to.
 	ResultPath string `long:"result-path" description:"Directory path where results should be written to. The placeholder \"%datetime%\" can be used for the current date and time." default:"evaluation-%datetime%"`
+	// Runs holds how often the evaluation of the selected models and languages is repeated.
+	Runs uint `long:"runs" description:"Number of runs to perform." default:"1"`
 	// TestdataPath determines the testdata path where all repositories reside grouped by languages.
 	TestdataPath string `long:"testdata" description:"Path to the testdata directory where all repositories reside grouped by languages." default:"testdata/"`
 
@@ -97,6 +99,10 @@ func (command *Evaluate) Execute(args []string) (err error) {
 		if command.SymflowerBinaryPath != "" {
 			tools.SymflowerPath = command.SymflowerBinaryPath
 		}
+
+		if command.Runs == 0 {
+			log.Panicf("number of configured runs is 0")
+		}
 	}
 
 	// Gather languages.
@@ -229,60 +235,72 @@ func (command *Evaluate) Execute(args []string) (err error) {
 	assessments := report.NewAssessmentPerModelPerLanguagePerRepository(maps.Values(modelsSelected), maps.Values(languagesSelected), command.Repositories)
 	problemsPerModel := map[string][]error{}
 	{
-		for _, languageID := range command.Languages {
-			for _, modelID := range command.Models {
-				model := modelsSelected[modelID]
-				language := languagesSelected[languageID]
+		for r := uint(0); r < command.Runs; r++ {
+			if command.Runs > 1 {
+				log.Printf("Run %d/%d", r+1, command.Runs)
+			}
 
-				repositoryPath := filepath.Join(languageID, repositoryPlainName)
+			for _, languageID := range command.Languages {
+				for _, modelID := range command.Models {
+					model := modelsSelected[modelID]
+					language := languagesSelected[languageID]
 
-				assessment, ps, err := evaluate.Repository(command.logger, command.ResultPath, model, language, command.TestdataPath, repositoryPath)
-				assessments[model][language][repositoryPath].Add(assessment)
-				if err != nil {
-					ps = append(ps, err)
-				}
-				if len(ps) > 0 {
-					log.Printf("Excluding model %q since it was not able to solve the %q repository for language %q: %+v", modelID, repositoryPath, languageID, ps)
-					problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
+					repositoryPath := filepath.Join(languageID, repositoryPlainName)
+
+					assessment, ps, err := evaluate.Repository(command.logger, command.ResultPath, model, language, command.TestdataPath, repositoryPath)
+					assessments[model][language][repositoryPath].Add(assessment)
+					if err != nil {
+						ps = append(ps, err)
+					}
+					if len(ps) > 0 {
+						log.Printf("Excluding model %q since it was not able to solve the %q repository for language %q: %+v", modelID, repositoryPath, languageID, ps)
+						problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
+					}
 				}
 			}
 		}
 	}
 
 	// Evaluating models and languages.
 	log.Printf("Evaluating models and languages")
-	for _, languageID := range command.Languages {
-		languagePath := filepath.Join(command.TestdataPath, languageID)
-		repositories, err := os.ReadDir(languagePath)
-		if err != nil {
-			log.Panicf("ERROR: language path %q cannot be accessed: %s", languagePath, err)
+	for r := uint(0); r < command.Runs; r++ {
+		if command.Runs > 1 {
+			log.Printf("Run %d/%d", r+1, command.Runs)
 		}
 
-		for _, repository := range repositories {
-			repositoryPath := filepath.Join(languageID, repository.Name())
-
-			if !repository.IsDir() || (len(commandRepositories) > 0 && !commandRepositories[repositoryPath]) {
-				continue
+		for _, languageID := range command.Languages {
+			languagePath := filepath.Join(command.TestdataPath, languageID)
+			repositories, err := os.ReadDir(languagePath)
+			if err != nil {
+				log.Panicf("ERROR: language path %q cannot be accessed: %s", languagePath, err)
 			}
 
-			// Do not include "plain" repositories in this step of the evaluation, because they have been checked with the common check before.
-			if repository.Name() == repositoryPlainName {
-				continue
-			}
+			for _, repository := range repositories {
+				repositoryPath := filepath.Join(languageID, repository.Name())
 
-			for _, modelID := range command.Models {
-				if len(problemsPerModel[modelID]) > 0 {
+				if !repository.IsDir() || (len(commandRepositories) > 0 && !commandRepositories[repositoryPath]) {
+					continue
+				}
+
+				// Do not include "plain" repositories in this step of the evaluation, because they have been checked with the common check before.
+				if repository.Name() == repositoryPlainName {
 					continue
 				}
 
-				model := modelsSelected[modelID]
-				language := languagesSelected[languageID]
+				for _, modelID := range command.Models {
+					if len(problemsPerModel[modelID]) > 0 {
+						continue
+					}
 
-				assessment, ps, err := evaluate.Repository(command.logger, command.ResultPath, model, language, command.TestdataPath, repositoryPath)
-				assessments[model][language][repositoryPath].Add(assessment)
-				problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
-				if err != nil {
-					log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err)
+					model := modelsSelected[modelID]
+					language := languagesSelected[languageID]
+
+					assessment, ps, err := evaluate.Repository(command.logger, command.ResultPath, model, language, command.TestdataPath, repositoryPath)
+					assessments[model][language][repositoryPath].Add(assessment)
+					problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
+					if err != nil {
+						log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err)
+					}
 				}
 			}
 		}
@@ -299,7 +317,7 @@ func (command *Evaluate) Execute(args []string) (err error) {
 		}
 	}
 	if isOnlyPlainRepositories {
-		totalScore = uint(len(languagesSelected))
+		totalScore = uint(len(languagesSelected)) * command.Runs
 	}
 
 	assessmentsPerModel := assessments.CollapseByModel()

diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go
@@ -198,9 +198,7 @@ func TestEvaluateExecute(t *testing.T) {
 					},
 				}, []uint{14})
 				assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint(0))
-				if !assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) {
-					t.Logf("Output: %s", output)
-				}
+				assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
 			},
 			ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
 				"categories.svg": func(t *testing.T, filePath, data string) {
@@ -267,9 +265,7 @@ func TestEvaluateExecute(t *testing.T) {
 					},
 				}, []uint{28})
 				assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint(0))
-				if !assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) {
-					t.Logf("Output: %s", output)
-				}
+				assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
 			},
 			ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
 				"categories.svg": func(t *testing.T, filePath, data string) {
@@ -363,9 +359,7 @@ func TestEvaluateExecute(t *testing.T) {
 						},
 					}, []uint{14})
 					assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint(0))
-					if !assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) {
-						t.Logf("Output: %s", output)
-					}
+					assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
 				},
 				ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
 					"categories.svg": func(t *testing.T, filePath, data string) {
@@ -424,9 +418,7 @@ func TestEvaluateExecute(t *testing.T) {
 
 				ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) {
 					assert.Regexp(t, `Evaluation score for "symflower/symbolic-execution" \("code-no-excess"\): score=14, coverage-statement=10, files-executed=1, processing-time=\d+, response-no-error=1, response-no-excess=1, response-with-code=1`, output)
-					if !assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) {
-						t.Logf("Output: %s", output)
-					}
+					assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
 				},
 				ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
 					"categories.svg": func(t *testing.T, filePath, data string) {
@@ -543,6 +535,58 @@ func TestEvaluateExecute(t *testing.T) {
 		})
 	})
 
+	t.Run("Runs", func(t *testing.T) {
+		validate(t, &testCase{
+			Name: "Multiple",
+
+			Arguments: []string{
+				"--model", "symflower/symbolic-execution",
+				"--repository", filepath.Join("golang", "plain"),
+				"--runs=3",
+			},
+
+			ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) {
+				actualAssessments := validateMetrics(t, extractMetricsLogsMatch, output, []metrics.Assessments{
+					metrics.Assessments{
+						metrics.AssessmentKeyCoverageStatement: 30,
+						metrics.AssessmentKeyFilesExecuted:     3,
+						metrics.AssessmentKeyResponseNoError:   3,
+						metrics.AssessmentKeyResponseNoExcess:  3,
+						metrics.AssessmentKeyResponseWithCode:  3,
+					},
+				}, []uint{42})
+				assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint(0))
+				assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
+			},
+			ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
+				"categories.svg": nil,
+				"evaluation.csv": func(t *testing.T, filePath, data string) {
+					actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{
+						metrics.Assessments{
+							metrics.AssessmentKeyCoverageStatement: 30,
+							metrics.AssessmentKeyFilesExecuted:     3,
+							metrics.AssessmentKeyResponseNoError:   3,
+							metrics.AssessmentKeyResponseNoExcess:  3,
+							metrics.AssessmentKeyResponseWithCode:  3,
+						},
+					}, []uint{42})
+					assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint(0))
+				},
+				"evaluation.log": func(t *testing.T, filePath, data string) {
+					assert.Contains(t, data, "Run 1/3")
+					assert.Contains(t, data, "Run 2/3")
+					assert.Contains(t, data, "Run 3/3")
+				},
+				"golang-summed.csv": nil,
+				"models-summed.csv": nil,
+				"README.md":         nil,
+				filepath.Join("symflower_symbolic-execution", "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) {
+					assert.Equal(t, 3, strings.Count(data, `Evaluating model "symflower/symbolic-execution"`))
+				},
+			},
+		})
+	})
+
 	// This case checks a beautiful bug where the Markdown export crashed when the current working directory contained a README.md file. While this is not the case during the tests (as the current work directory is the directory of this file), it certainly caused problems when our binary was executed from the repository root (which of course contained a README.md). Therefore, we sadly have to modify the current work directory right within the tests of this case to reproduce the problem and fix it forever.
 	validate(t, &testCase{
 		Name: "Current work directory contains a README.md",