Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ type Evaluate struct {

// Runs holds the number of runs to perform.
Runs uint `long:"runs" description:"Number of runs to perform." default:"1"`
// RunsSequential indicates that interleaved runs are disabled and runs are performed sequentially.
RunsSequential bool `long:"runs-sequential" description:"By default, multiple runs are performed in an interleaved fashion to avoid caching of model responses. Changing this behavior to \"sequential\" queries the same model repeatedly instead."`

// logger holds the logger of the command.
logger *log.Logger
Expand Down Expand Up @@ -254,7 +256,8 @@ func (command *Evaluate) Execute(args []string) (err error) {
ResultPath: command.ResultPath,
TestdataPath: command.TestdataPath,

Runs: command.Runs,
Runs: command.Runs,
RunsSequential: command.RunsSequential,
})

assessmentsPerModel := assessments.CollapseByModel()
Expand Down
142 changes: 98 additions & 44 deletions evaluate/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,26 @@ type Context struct {

// Runs holds the number of runs to perform.
Runs uint
// RunsSequential indicates that interleaved runs are disabled and runs are performed sequentially.
RunsSequential bool
}

// runsAtLanguageLevel reports how often the outer loop over all languages is repeated.
// In sequential mode the languages are walked exactly once, otherwise once per configured run.
func (ctx *Context) runsAtLanguageLevel() uint {
	runs := ctx.Runs
	if ctx.RunsSequential {
		// Sequential runs repeat per model instead, so a single pass over the languages is enough.
		runs = 1
	}

	return runs
}

// runsAtModelLevel reports how often a single model is queried back-to-back inside the model loop.
// In sequential mode this is the configured number of runs, otherwise exactly one.
func (ctx *Context) runsAtModelLevel() uint {
	if !ctx.RunsSequential {
		// Interleaved runs repeat on the language level, so each model is queried once per pass.
		return 1
	}

	return ctx.Runs
}

// RepositoryPlainName holds the name of the plain repository.
Expand All @@ -45,28 +65,34 @@ func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePer
// Ensure we report metrics for every model even if they are excluded.
assessments = report.NewAssessmentPerModelPerLanguagePerRepository(ctx.Models, ctx.Languages, ctx.RepositoryPaths)
problemsPerModel := map[string][]error{}

{
for r := uint(0); r < ctx.Runs; r++ {
if ctx.Runs > 1 {
ctx.Log.Printf("Run %d/%d", r+1, ctx.Runs)
// Create temporary repositories for each language so the repository is copied only once per language.
temporaryRepositories := map[string]string{}
for _, language := range ctx.Languages {
repositoryPath := filepath.Join(language.ID(), RepositoryPlainName)
temporaryRepositoryPath, cleanup, err := TemporaryRepository(ctx.Log, filepath.Join(ctx.TestdataPath, repositoryPath))
if err != nil {
ctx.Log.Panicf("ERROR: unable to create temporary repository path: %s", err)
}

defer cleanup()

temporaryRepositories[repositoryPath] = temporaryRepositoryPath
}
for rl := uint(0); rl < ctx.runsAtLanguageLevel(); rl++ {
if ctx.Runs > 1 && !ctx.RunsSequential {
ctx.Log.Printf("Run %d/%d", rl+1, ctx.Runs)
}

for _, language := range ctx.Languages {
languageID := language.ID()

repositoryPath := filepath.Join(languageID, RepositoryPlainName)
temporaryRepositoryPath, cleanup, err := TemporaryRepository(ctx.Log, filepath.Join(ctx.TestdataPath, repositoryPath))
if err != nil {
ctx.Log.Panicf("ERROR: unable to create temporary repository path: %s", err)
}
repositoryPath := filepath.Join(language.ID(), RepositoryPlainName)
temporaryRepositoryPath := temporaryRepositories[repositoryPath]

for _, model := range ctx.Models {
modelID := model.ID()

if err := ResetTemporaryRepository(ctx.Log, temporaryRepositoryPath); err != nil {
ctx.Log.Panicf("ERROR: unable to reset temporary repository path: %s", err)
}

if modelSucceededBasicChecksOfLanguage[model] == nil {
modelSucceededBasicChecksOfLanguage[model] = map[evallanguage.Language]bool{}
}
Expand All @@ -75,20 +101,28 @@ func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePer
r.SetQueryAttempts(ctx.QueryAttempts)
}

assessment, ps, err := Repository(ctx.Log, ctx.ResultPath, model, language, temporaryRepositoryPath, repositoryPath)
assessments[model][language][repositoryPath].Add(assessment)
if err != nil {
ps = append(ps, err)
}
if len(ps) > 0 {
ctx.Log.Printf("Model %q was not able to solve the %q repository for language %q: %+v", modelID, repositoryPath, languageID, ps)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
} else {
modelSucceededBasicChecksOfLanguage[model][language] = true
for rm := uint(0); rm < ctx.runsAtModelLevel(); rm++ {
if err := ResetTemporaryRepository(ctx.Log, temporaryRepositoryPath); err != nil {
ctx.Log.Panicf("ERROR: unable to reset temporary repository path: %s", err)
}

if ctx.Runs > 1 && ctx.RunsSequential {
ctx.Log.Printf("Run %d/%d for model %q", rm+1, ctx.Runs, modelID)
}

assessment, ps, err := Repository(ctx.Log, ctx.ResultPath, model, language, temporaryRepositoryPath, repositoryPath)
assessments[model][language][repositoryPath].Add(assessment)
if err != nil {
ps = append(ps, err)
}
if len(ps) > 0 {
ctx.Log.Printf("Model %q was not able to solve the %q repository for language %q: %+v", modelID, repositoryPath, languageID, ps)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
} else {
modelSucceededBasicChecksOfLanguage[model][language] = true
}
}
}

cleanup() // Remove temporary directory.
}
}
}
Expand All @@ -100,9 +134,29 @@ func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePer

// Evaluating models and languages.
ctx.Log.Printf("Evaluating models and languages")
for r := uint(0); r < ctx.Runs; r++ {
if ctx.Runs > 1 {
ctx.Log.Printf("Run %d/%d", r+1, ctx.Runs)
// Create temporary repositories for each language so the repository is copied only once per language.
temporaryRepositories := map[string]string{}
for _, language := range ctx.Languages {
languagePath := filepath.Join(ctx.TestdataPath, language.ID())
repositories, err := os.ReadDir(languagePath)
if err != nil {
ctx.Log.Panicf("ERROR: language path %q cannot be accessed: %s", languagePath, err)
}
for _, repository := range repositories {
repositoryPath := filepath.Join(language.ID(), repository.Name())
temporaryRepositoryPath, cleanup, err := TemporaryRepository(ctx.Log, filepath.Join(ctx.TestdataPath, repositoryPath))
if err != nil {
ctx.Log.Panicf("ERROR: unable to create temporary repository path: %s", err)
}

defer cleanup()

temporaryRepositories[repositoryPath] = temporaryRepositoryPath
}
}
for rl := uint(0); rl < ctx.runsAtLanguageLevel(); rl++ {
if ctx.Runs > 1 && !ctx.RunsSequential {
ctx.Log.Printf("Run %d/%d", rl+1, ctx.Runs)
}

for _, language := range ctx.Languages {
Expand All @@ -116,6 +170,7 @@ func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePer

for _, repository := range repositories {
repositoryPath := filepath.Join(languageID, repository.Name())
temporaryRepositoryPath := temporaryRepositories[repositoryPath]

if !repository.IsDir() || (len(ctx.RepositoryPaths) > 0 && !repositoriesLookup[repositoryPath]) {
continue
Expand All @@ -126,33 +181,32 @@ func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePer
continue
}

temporaryRepositoryPath, cleanup, err := TemporaryRepository(ctx.Log, filepath.Join(ctx.TestdataPath, repositoryPath))
if err != nil {
ctx.Log.Panicf("ERROR: unable to create temporary repository path: %s", err)
}

for _, model := range ctx.Models {
modelID := model.ID()

if err := ResetTemporaryRepository(ctx.Log, temporaryRepositoryPath); err != nil {
ctx.Log.Panicf("ERROR: unable to reset temporary repository path: %s", err)
}

if !modelSucceededBasicChecksOfLanguage[model][language] {
log.Printf("Excluding model %q for language %q cause it did not succeed basic checks", model.ID(), language.ID())

continue
}

assessment, ps, err := Repository(ctx.Log, ctx.ResultPath, model, language, temporaryRepositoryPath, repositoryPath)
assessments[model][language][repositoryPath].Add(assessment)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
if err != nil {
ctx.Log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err)
for rm := uint(0); rm < ctx.runsAtModelLevel(); rm++ {
if ctx.Runs > 1 && ctx.RunsSequential {
ctx.Log.Printf("Run %d/%d for model %q", rm+1, ctx.Runs, modelID)
}

if err := ResetTemporaryRepository(ctx.Log, temporaryRepositoryPath); err != nil {
ctx.Log.Panicf("ERROR: unable to reset temporary repository path: %s", err)
}

assessment, ps, err := Repository(ctx.Log, ctx.ResultPath, model, language, temporaryRepositoryPath, repositoryPath)
assessments[model][language][repositoryPath].Add(assessment)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
if err != nil {
ctx.Log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err)
}
}
}

cleanup() // Remove temporary directory.
}
}
}
Expand Down
113 changes: 113 additions & 0 deletions evaluate/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -507,4 +507,117 @@ func TestEvaluate(t *testing.T) {
})
}
})
t.Run("Runs", func(t *testing.T) {
generateTestsForFilePlainSuccess := func(args mock.Arguments) {
require.NoError(t, os.WriteFile(filepath.Join(args.String(2), "plain_test.go"), []byte("package plain\nimport \"testing\"\nfunc TestFunction(t *testing.T){}"), 0600))
}
generateTestsForFilePlainSuccessMetrics := metrics.Assessments{
metrics.AssessmentKeyProcessingTime: 1,
}
generateSuccess := func(mockedModel *modeltesting.MockModel) {
mockedModel.On("GenerateTestsForFile", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(generateTestsForFilePlainSuccessMetrics, nil).Run(generateTestsForFilePlainSuccess)
}
{
languageGolang := &golang.Language{}
mockedModelID := "mocked-generation-model"
mockedModel := modeltesting.NewMockModelNamed(t, mockedModelID)
repositoryPath := filepath.Join("golang", "plain")
validate(t, &testCase{
Name: "Interleaved",

Before: func(t *testing.T, logger *log.Logger, resultPath string) {
generateSuccess(mockedModel)
},

Context: &Context{
Languages: []language.Language{
&golang.Language{},
},

Models: []evalmodel.Model{
mockedModel,
},

RepositoryPaths: []string{
repositoryPath,
},

Runs: 3,
RunsSequential: false,
},

ExpectedAssessments: map[evalmodel.Model]map[language.Language]map[string]metrics.Assessments{
mockedModel: map[language.Language]map[string]metrics.Assessments{
languageGolang: map[string]metrics.Assessments{
repositoryPath: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyResponseNoError: 3,
},
},
},
},
ExpectedTotalScore: 3,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
},
ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) {
assert.Contains(t, output, "Run 1/3")
assert.Contains(t, output, "Run 2/3")
assert.Contains(t, output, "Run 3/3")
assert.NotRegexp(t, `Run \d+/\d+ for model`, output)
},
})
}
{
languageGolang := &golang.Language{}
mockedModelID := "mocked-generation-model"
mockedModel := modeltesting.NewMockModelNamed(t, mockedModelID)
repositoryPath := filepath.Join("golang", "plain")
validate(t, &testCase{
Name: "Sequential",

Before: func(t *testing.T, logger *log.Logger, resultPath string) {
generateSuccess(mockedModel)
},

Context: &Context{
Languages: []language.Language{
&golang.Language{},
},

Models: []evalmodel.Model{
mockedModel,
},

RepositoryPaths: []string{
repositoryPath,
},

Runs: 3,
RunsSequential: true,
},

ExpectedAssessments: map[evalmodel.Model]map[language.Language]map[string]metrics.Assessments{
mockedModel: map[language.Language]map[string]metrics.Assessments{
languageGolang: map[string]metrics.Assessments{
repositoryPath: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyResponseNoError: 3,
},
},
},
},
ExpectedTotalScore: 3,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
},
ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) {
assert.Contains(t, output, "Run 1/3 for model")
assert.Contains(t, output, "Run 2/3 for model")
assert.Contains(t, output, "Run 3/3 for model")
assert.NotRegexp(t, `Run \d+/\d+$`, output)
},
})
}
})
}