Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 55 additions & 37 deletions cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ type Evaluate struct {
Repositories []string `long:"repository" description:"Evaluate with this repository. By default all repositories are used."`
// ResultPath holds the directory path where results should be written to.
ResultPath string `long:"result-path" description:"Directory path where results should be written to. The placeholder \"%datetime%\" can be used for the current date and time." default:"evaluation-%datetime%"`
// Runs holds the number of runs to perform.
Runs uint `long:"runs" description:"Number of runs to perform." default:"1"`
// TestdataPath determines the testdata path where all repositories reside grouped by languages.
TestdataPath string `long:"testdata" description:"Path to the testdata directory where all repositories reside grouped by languages." default:"testdata/"`

Expand Down Expand Up @@ -97,6 +99,10 @@ func (command *Evaluate) Execute(args []string) (err error) {
if command.SymflowerBinaryPath != "" {
tools.SymflowerPath = command.SymflowerBinaryPath
}

if command.Runs == 0 {
log.Panicf("number of configured runs is 0")
}
}

// Gather languages.
Expand Down Expand Up @@ -229,60 +235,72 @@ func (command *Evaluate) Execute(args []string) (err error) {
assessments := report.NewAssessmentPerModelPerLanguagePerRepository(maps.Values(modelsSelected), maps.Values(languagesSelected), command.Repositories)
problemsPerModel := map[string][]error{}
{
for _, languageID := range command.Languages {
for _, modelID := range command.Models {
model := modelsSelected[modelID]
language := languagesSelected[languageID]
for r := uint(0); r < command.Runs; r++ {
if command.Runs > 1 {
log.Printf("Run %d/%d", r+1, command.Runs)
}

repositoryPath := filepath.Join(languageID, repositoryPlainName)
for _, languageID := range command.Languages {
for _, modelID := range command.Models {
model := modelsSelected[modelID]
language := languagesSelected[languageID]

assessment, ps, err := evaluate.Repository(command.logger, command.ResultPath, model, language, command.TestdataPath, repositoryPath)
assessments[model][language][repositoryPath].Add(assessment)
if err != nil {
ps = append(ps, err)
}
if len(ps) > 0 {
log.Printf("Excluding model %q since it was not able to solve the %q repository for language %q: %+v", modelID, repositoryPath, languageID, ps)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
repositoryPath := filepath.Join(languageID, repositoryPlainName)

assessment, ps, err := evaluate.Repository(command.logger, command.ResultPath, model, language, command.TestdataPath, repositoryPath)
assessments[model][language][repositoryPath].Add(assessment)
if err != nil {
ps = append(ps, err)
}
if len(ps) > 0 {
log.Printf("Excluding model %q since it was not able to solve the %q repository for language %q: %+v", modelID, repositoryPath, languageID, ps)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
}
}
}
}
}

// Evaluating models and languages.
log.Printf("Evaluating models and languages")
for _, languageID := range command.Languages {
languagePath := filepath.Join(command.TestdataPath, languageID)
repositories, err := os.ReadDir(languagePath)
if err != nil {
log.Panicf("ERROR: language path %q cannot be accessed: %s", languagePath, err)
for r := uint(0); r < command.Runs; r++ {
if command.Runs > 1 {
log.Printf("Run %d/%d", r+1, command.Runs)
}

for _, repository := range repositories {
repositoryPath := filepath.Join(languageID, repository.Name())

if !repository.IsDir() || (len(commandRepositories) > 0 && !commandRepositories[repositoryPath]) {
continue
for _, languageID := range command.Languages {
languagePath := filepath.Join(command.TestdataPath, languageID)
repositories, err := os.ReadDir(languagePath)
if err != nil {
log.Panicf("ERROR: language path %q cannot be accessed: %s", languagePath, err)
}

// Do not include "plain" repositories in this step of the evaluation, because they have already been evaluated by the common check above.
if repository.Name() == repositoryPlainName {
continue
}
for _, repository := range repositories {
repositoryPath := filepath.Join(languageID, repository.Name())

for _, modelID := range command.Models {
if len(problemsPerModel[modelID]) > 0 {
if !repository.IsDir() || (len(commandRepositories) > 0 && !commandRepositories[repositoryPath]) {
continue
}

// Do not include "plain" repositories in this step of the evaluation, because they have already been evaluated by the common check above.
if repository.Name() == repositoryPlainName {
continue
}

model := modelsSelected[modelID]
language := languagesSelected[languageID]
for _, modelID := range command.Models {
if len(problemsPerModel[modelID]) > 0 {
continue
}

assessment, ps, err := evaluate.Repository(command.logger, command.ResultPath, model, language, command.TestdataPath, repositoryPath)
assessments[model][language][repositoryPath].Add(assessment)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
if err != nil {
log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err)
model := modelsSelected[modelID]
language := languagesSelected[languageID]

assessment, ps, err := evaluate.Repository(command.logger, command.ResultPath, model, language, command.TestdataPath, repositoryPath)
assessments[model][language][repositoryPath].Add(assessment)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
if err != nil {
log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err)
}
}
}
}
Expand All @@ -299,7 +317,7 @@ func (command *Evaluate) Execute(args []string) (err error) {
}
}
if isOnlyPlainRepositories {
totalScore = uint(len(languagesSelected))
totalScore = uint(len(languagesSelected)) * command.Runs
}

assessmentsPerModel := assessments.CollapseByModel()
Expand Down
68 changes: 56 additions & 12 deletions cmd/eval-dev-quality/cmd/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,9 +198,7 @@ func TestEvaluateExecute(t *testing.T) {
},
}, []uint{14})
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint(0))
if !assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) {
t.Logf("Output: %s", output)
}
assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"categories.svg": func(t *testing.T, filePath, data string) {
Expand Down Expand Up @@ -267,9 +265,7 @@ func TestEvaluateExecute(t *testing.T) {
},
}, []uint{28})
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint(0))
if !assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) {
t.Logf("Output: %s", output)
}
assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"categories.svg": func(t *testing.T, filePath, data string) {
Expand Down Expand Up @@ -363,9 +359,7 @@ func TestEvaluateExecute(t *testing.T) {
},
}, []uint{14})
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint(0))
if !assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) {
t.Logf("Output: %s", output)
}
assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"categories.svg": func(t *testing.T, filePath, data string) {
Expand Down Expand Up @@ -424,9 +418,7 @@ func TestEvaluateExecute(t *testing.T) {

ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) {
assert.Regexp(t, `Evaluation score for "symflower/symbolic-execution" \("code-no-excess"\): score=14, coverage-statement=10, files-executed=1, processing-time=\d+, response-no-error=1, response-no-excess=1, response-with-code=1`, output)
if !assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) {
t.Logf("Output: %s", output)
}
assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"categories.svg": func(t *testing.T, filePath, data string) {
Expand Down Expand Up @@ -543,6 +535,58 @@ func TestEvaluateExecute(t *testing.T) {
})
})

t.Run("Runs", func(t *testing.T) {
validate(t, &testCase{
Name: "Multiple",

Arguments: []string{
"--model", "symflower/symbolic-execution",
"--repository", filepath.Join("golang", "plain"),
"--runs=3",
},

ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) {
actualAssessments := validateMetrics(t, extractMetricsLogsMatch, output, []metrics.Assessments{
metrics.Assessments{
metrics.AssessmentKeyCoverageStatement: 30,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyResponseNoError: 3,
metrics.AssessmentKeyResponseNoExcess: 3,
metrics.AssessmentKeyResponseWithCode: 3,
},
}, []uint{42})
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint(0))
assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"categories.svg": nil,
"evaluation.csv": func(t *testing.T, filePath, data string) {
actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{
metrics.Assessments{
metrics.AssessmentKeyCoverageStatement: 30,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyResponseNoError: 3,
metrics.AssessmentKeyResponseNoExcess: 3,
metrics.AssessmentKeyResponseWithCode: 3,
},
}, []uint{42})
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint(0))
},
"evaluation.log": func(t *testing.T, filePath, data string) {
assert.Contains(t, data, "Run 1/3")
assert.Contains(t, data, "Run 2/3")
assert.Contains(t, data, "Run 3/3")
},
"golang-summed.csv": nil,
"models-summed.csv": nil,
"README.md": nil,
filepath.Join("symflower_symbolic-execution", "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) {
assert.Equal(t, 3, strings.Count(data, `Evaluating model "symflower/symbolic-execution"`))
},
},
})
})

// This case checks a beautiful bug where the Markdown export crashed when the current working directory contained a README.md file. This is not the case during the tests (the current working directory is the directory of this file), but it did cause problems when our binary was executed from the repository root (which of course contains a README.md). Therefore, we sadly have to change the current working directory right within the tests of this case to reproduce the problem and keep it fixed forever.
validate(t, &testCase{
Name: "Current work directory contains a README.md",
Expand Down