Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ type Evaluate struct {
// ResultPath holds the directory path where results should be written to.
ResultPath string `long:"result-path" description:"Directory path where results should be written to. The placeholder \"%datetime%\" can be used for the current date and time." default:"evaluation-%datetime%"`
// Runs holds the number of runs to perform.
Runs uint `long:"runs" description:"Number of runs to perform." default:"1"`
Runs uint64 `long:"runs" description:"Number of runs to perform." default:"1"`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Commit subject: not for longer values, but to be consistent explicitly.

// TestdataPath determines the testdata path where all repositories reside grouped by languages.
TestdataPath string `long:"testdata" description:"Path to the testdata directory where all repositories reside grouped by languages." default:"testdata/"`

Expand Down Expand Up @@ -235,7 +235,7 @@ func (command *Evaluate) Execute(args []string) (err error) {
assessments := report.NewAssessmentPerModelPerLanguagePerRepository(maps.Values(modelsSelected), maps.Values(languagesSelected), command.Repositories)
problemsPerModel := map[string][]error{}
{
for r := uint(0); r < command.Runs; r++ {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Early merging this commit. @ruiAzevedo19 when you do an early merger, please always make sure that the commits you are adding are working. Don't work on features in an early merger; instead, just early merge.

for r := uint64(0); r < command.Runs; r++ {
if command.Runs > 1 {
log.Printf("Run %d/%d", r+1, command.Runs)
}
Expand Down Expand Up @@ -263,7 +263,7 @@ func (command *Evaluate) Execute(args []string) (err error) {

// Evaluating models and languages.
log.Printf("Evaluating models and languages")
for r := uint(0); r < command.Runs; r++ {
for r := uint64(0); r < command.Runs; r++ {
if command.Runs > 1 {
log.Printf("Run %d/%d", r+1, command.Runs)
}
Expand Down Expand Up @@ -306,7 +306,7 @@ func (command *Evaluate) Execute(args []string) (err error) {
}
}

totalScore := uint(0)
totalScore := uint64(0)
// Set the total score to the number of evaluated languages if we are just checking the "plain" repositories since there is only one task to solve per language.
isOnlyPlainRepositories := true
for repository := range commandRepositories {
Expand All @@ -317,7 +317,7 @@ func (command *Evaluate) Execute(args []string) (err error) {
}
}
if isOnlyPlainRepositories {
totalScore = uint(len(languagesSelected)) * command.Runs
totalScore = uint64(len(languagesSelected)) * command.Runs
}

assessmentsPerModel := assessments.CollapseByModel()
Expand All @@ -336,7 +336,7 @@ func (command *Evaluate) Execute(args []string) (err error) {
return err
}

_ = assessmentsPerModel.WalkByScore(func(model model.Model, assessment metrics.Assessments, score uint) (err error) {
_ = assessmentsPerModel.WalkByScore(func(model model.Model, assessment metrics.Assessments, score uint64) (err error) {
log.Printf("Evaluation score for %q (%q): %s", model.ID(), assessment.Category(totalScore).ID, assessment)

return nil
Expand Down
288 changes: 144 additions & 144 deletions cmd/eval-dev-quality/cmd/evaluate_test.go

Large diffs are not rendered by default.

29 changes: 17 additions & 12 deletions evaluate/metrics/assessment.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,19 @@ var (
// AllAssessmentKeysStrings returns all registered assessment keys as strings.
AllAssessmentKeysStrings []string

// pointsPerAssessment holds the points awarded for a specific assessment.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Commit subject "Redo the coverage logic": the same rules apply as internally. State what, and why.

pointsPerAssessment = map[AssessmentKey]uint{}
// multiplierPerAssessment holds the multipliers awarded for a specific assessment.
multiplierPerAssessment = map[AssessmentKey]uint64{}
)

// RegisterAssessmentKey registers a new assessment key.
// If the points for this assessment type are zero, it is ignored for the score computation.
func RegisterAssessmentKey(key string, points uint) AssessmentKey {
// If the multiplier for this assessment type is zero, it is ignored for the score computation.
func RegisterAssessmentKey(key string, multiplier uint64) AssessmentKey {
assessment := AssessmentKey(key)
i := sort.SearchStrings(AllAssessmentKeysStrings, key)

allAssessmentKeys = slices.Insert(allAssessmentKeys, i, assessment)
AllAssessmentKeysStrings = slices.Insert(AllAssessmentKeysStrings, i, key)
pointsPerAssessment[assessment] = points
multiplierPerAssessment[assessment] = multiplier

return assessment
}
Expand All @@ -39,8 +39,8 @@ var (
// AssessmentKeyProcessingTime holds the time in milliseconds that it took to complete the task.
AssessmentKeyProcessingTime = RegisterAssessmentKey("processing-time", 0)

// AssessmentKeyCoverageStatement counts the cases where 100% coverage was reached.
AssessmentKeyCoverageStatement = RegisterAssessmentKey("coverage-statement", 10)
// AssessmentKeyCoverage counts execution coverage objects.
AssessmentKeyCoverage = RegisterAssessmentKey("coverage", 2)

// AssessmentKeyResponseNoError indicates that a model responded without error.
AssessmentKeyResponseNoError = RegisterAssessmentKey("response-no-error", 1)
Expand All @@ -52,11 +52,11 @@ var (
)

// Assessments holds a collection of numerical assessment metrics.
type Assessments map[AssessmentKey]uint
type Assessments map[AssessmentKey]uint64

// NewAssessments creates a new assessment collection.
func NewAssessments() Assessments {
return map[AssessmentKey]uint{}
return map[AssessmentKey]uint64{}
}

// Add adds the given assessment collection to the current one.
Expand Down Expand Up @@ -95,13 +95,13 @@ func Merge(a Assessments, b Assessments) (c Assessments) {
}

// Score computes the score over all assessments in the collection.
func (a Assessments) Score() (score uint) {
func (a Assessments) Score() (score uint64) {
if len(a) == 0 {
return 0
}

for key, value := range a {
if pointsPerAssessment[key] != 0 {
if multiplierPerAssessment[key] != 0 {
score += value
}
}
Expand All @@ -111,7 +111,12 @@ func (a Assessments) Score() (score uint) {

// Award yields the score points defined for the given key.
func (a Assessments) Award(key AssessmentKey) {
a[key] += pointsPerAssessment[key]
a[key] += multiplierPerAssessment[key]
}

// AwardPoints yields the score points defined for the given key, multiplied by "count".
func (a Assessments) AwardPoints(key AssessmentKey, count uint64) {
a[key] += multiplierPerAssessment[key] * count
}

// String returns a string representation of the metrics.
Expand Down
44 changes: 22 additions & 22 deletions evaluate/metrics/assessment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,26 +37,26 @@ func TestAssessmentsAdd(t *testing.T) {
Name: "Non existing key",

Assessments: NewAssessments(),
X: map[AssessmentKey]uint{
X: map[AssessmentKey]uint64{
AssessmentKeyResponseNoExcess: 1,
},

ExpectedAssessments: map[AssessmentKey]uint{
ExpectedAssessments: map[AssessmentKey]uint64{
AssessmentKeyResponseNoExcess: 1,
},
})

validate(t, &testCase{
Name: "Existing key",

Assessments: map[AssessmentKey]uint{
Assessments: map[AssessmentKey]uint64{
AssessmentKeyResponseNoExcess: 1,
},
X: map[AssessmentKey]uint{
X: map[AssessmentKey]uint64{
AssessmentKeyResponseNoExcess: 1,
},

ExpectedAssessments: map[AssessmentKey]uint{
ExpectedAssessments: map[AssessmentKey]uint64{
AssessmentKeyResponseNoExcess: 2,
},
})
Expand Down Expand Up @@ -90,26 +90,26 @@ func TestAssessmentsMerge(t *testing.T) {
Name: "Non existing key",

A: NewAssessments(),
B: map[AssessmentKey]uint{
B: map[AssessmentKey]uint64{
AssessmentKeyResponseNoExcess: 1,
},

ExpectedC: map[AssessmentKey]uint{
ExpectedC: map[AssessmentKey]uint64{
AssessmentKeyResponseNoExcess: 1,
},
})

validate(t, &testCase{
Name: "Existing key",

A: map[AssessmentKey]uint{
A: map[AssessmentKey]uint64{
AssessmentKeyResponseNoExcess: 1,
},
B: map[AssessmentKey]uint{
B: map[AssessmentKey]uint64{
AssessmentKeyResponseNoExcess: 1,
},

ExpectedC: map[AssessmentKey]uint{
ExpectedC: map[AssessmentKey]uint64{
AssessmentKeyResponseNoExcess: 2,
},
})
Expand Down Expand Up @@ -144,12 +144,12 @@ func TestAssessmentString(t *testing.T) {
Name: "Non-empty Metrics",

Assessment: Assessments{
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyResponseNoError: 3,
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseWithCode: 5,
AssessmentKeyProcessingTime: 200,
AssessmentKeyCoverage: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyResponseNoError: 3,
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseWithCode: 5,
AssessmentKeyProcessingTime: 200,
},

ExpectedString: "score=15, coverage-statement=1, files-executed=2, processing-time=200, response-no-error=3, response-no-excess=4, response-with-code=5",
Expand Down Expand Up @@ -239,7 +239,7 @@ func TestAssessmentsScore(t *testing.T) {

Assessments Assessments

ExpectedScore uint
ExpectedScore uint64
}

validate := func(t *testing.T, tc *testCase) {
Expand All @@ -255,18 +255,18 @@ func TestAssessmentsScore(t *testing.T) {

Assessments: NewAssessments(),

ExpectedScore: 0,
ExpectedScore: uint64(0),
})

validate(t, &testCase{
Name: "Values Assessment",

Assessments: Assessments{
AssessmentKeyFilesExecuted: 5,
AssessmentKeyCoverageStatement: 4,
AssessmentKeyProcessingTime: 200,
AssessmentKeyFilesExecuted: 5,
AssessmentKeyCoverage: 4,
AssessmentKeyProcessingTime: 200,
},

ExpectedScore: 9,
ExpectedScore: uint64(9),
})
}
12 changes: 6 additions & 6 deletions evaluate/metrics/category.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,21 +76,21 @@ var (
// Category infers a categorical ranking of a model based on assessment values.
// A model's overall category corresponds to the criterion where the model was consistently able to receive the "total" amount of points, i.e. if there were 3 tasks in total and a model was able to produce executing code for all tasks, but the coverage goal was reached in only one case, then the category is only "CodeExecuted" because the coverage goal was not reached consistently.
// The returned category is never "nil".
func (a Assessments) Category(totalTasks uint) *AssessmentCategory {
func (a Assessments) Category(totalTasks uint64) *AssessmentCategory {
if totalTasks == 0 {
return AssessmentCategoryUnknown
}

switch {
case a[AssessmentKeyResponseNoError] != totalTasks*pointsPerAssessment[AssessmentKeyResponseNoError]:
case a[AssessmentKeyResponseNoError] != totalTasks*multiplierPerAssessment[AssessmentKeyResponseNoError]:
return AssessmentCategoryResponseError
case a[AssessmentKeyResponseWithCode] != totalTasks*pointsPerAssessment[AssessmentKeyResponseWithCode] && a[AssessmentKeyFilesExecuted] != totalTasks*pointsPerAssessment[AssessmentKeyFilesExecuted]: // TODO We cannot always detect yet if a model response contains source code, so ensure we don't categorize into "no code" if the code actually ran successfully all the time. https://github.com/symflower/eval-dev-quality/issues/43
case a[AssessmentKeyResponseWithCode] != totalTasks*multiplierPerAssessment[AssessmentKeyResponseWithCode] && a[AssessmentKeyFilesExecuted] != totalTasks*multiplierPerAssessment[AssessmentKeyFilesExecuted]: // TODO We cannot always detect yet if a model response contains source code, so ensure we don't categorize into "no code" if the code actually ran successfully all the time. https://github.com/symflower/eval-dev-quality/issues/43
return AssessmentCategoryResponseNoCode
case a[AssessmentKeyFilesExecuted] != totalTasks*pointsPerAssessment[AssessmentKeyFilesExecuted]:
case a[AssessmentKeyFilesExecuted] != totalTasks*multiplierPerAssessment[AssessmentKeyFilesExecuted]:
return AssessmentCategoryCodeInvalid
case a[AssessmentKeyCoverageStatement] != totalTasks*pointsPerAssessment[AssessmentKeyCoverageStatement]:
case a[AssessmentKeyCoverage] != totalTasks*multiplierPerAssessment[AssessmentKeyCoverage]:
return AssessmentCategoryCodeExecuted
case a[AssessmentKeyResponseNoExcess] != totalTasks*pointsPerAssessment[AssessmentKeyResponseNoExcess]:
case a[AssessmentKeyResponseNoExcess] != totalTasks*multiplierPerAssessment[AssessmentKeyResponseNoExcess]:
return AssessmentCategoryCodeCoverageStatementReached
default:
return AssessmentCategoryCodeNoExcess
Expand Down
30 changes: 15 additions & 15 deletions evaluate/metrics/category_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ func TestAssessmentsCategory(t *testing.T) {
Name string

Assessments Assessments
Total uint
Total uint64

ExpectedAssessmentCategory *AssessmentCategory
}
Expand Down Expand Up @@ -94,10 +94,10 @@ func TestAssessmentsCategory(t *testing.T) {
Name: "Full Statement Coverage",

Assessments: Assessments{
AssessmentKeyResponseNoError: 1,
AssessmentKeyResponseWithCode: 1,
AssessmentKeyFilesExecuted: 1,
AssessmentKeyCoverageStatement: 10,
AssessmentKeyResponseNoError: 1,
AssessmentKeyResponseWithCode: 1,
AssessmentKeyFilesExecuted: 1,
AssessmentKeyCoverage: 10,
},
Total: 1,

Expand All @@ -108,11 +108,11 @@ func TestAssessmentsCategory(t *testing.T) {
Name: "No Excess",

Assessments: Assessments{
AssessmentKeyResponseNoError: 1,
AssessmentKeyResponseWithCode: 1,
AssessmentKeyFilesExecuted: 1,
AssessmentKeyCoverageStatement: 10,
AssessmentKeyResponseNoExcess: 1,
AssessmentKeyResponseNoError: 1,
AssessmentKeyResponseWithCode: 1,
AssessmentKeyFilesExecuted: 1,
AssessmentKeyCoverage: 10,
AssessmentKeyResponseNoExcess: 1,
},
Total: 1,

Expand All @@ -123,11 +123,11 @@ func TestAssessmentsCategory(t *testing.T) {
Name: "Inconsistent",

Assessments: Assessments{
AssessmentKeyResponseNoError: 2,
AssessmentKeyResponseWithCode: 2,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyCoverageStatement: 1,
AssessmentKeyResponseNoExcess: 0,
AssessmentKeyResponseNoError: 2,
AssessmentKeyResponseWithCode: 2,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyCoverage: 1,
AssessmentKeyResponseNoExcess: 0,
},
Total: 2,

Expand Down
4 changes: 2 additions & 2 deletions evaluate/report/collection.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ type AssessmentPerLanguagePerModel map[language.Language]AssessmentPerModel
type AssessmentPerModel map[model.Model]metrics.Assessments

// WalkByScore walks the given assessment metrics by their score.
func (a AssessmentPerModel) WalkByScore(function func(model model.Model, assessment metrics.Assessments, score uint) error) (err error) {
func (a AssessmentPerModel) WalkByScore(function func(model model.Model, assessment metrics.Assessments, score uint64) error) (err error) {
models := maps.Keys(a)
slices.SortStableFunc(models, func(a, b model.Model) int {
return cmp.Compare(a.ID(), b.ID())
})

scores := make(map[model.Model]uint, len(models))
scores := make(map[model.Model]uint64, len(models))
for _, model := range models {
scores[model] = a[model].Score()
}
Expand Down
12 changes: 6 additions & 6 deletions evaluate/report/collection_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ func TestWalkByScore(t *testing.T) {
AssessmentPerModel AssessmentPerModel

ExpectedModelOrder []model.Model
ExpectedScoreOrder []uint
ExpectedScoreOrder []uint64
}

validate := func(t *testing.T, tc *testCase) {
Expand All @@ -148,8 +148,8 @@ func TestWalkByScore(t *testing.T) {

actualModelOrder := make([]model.Model, 0, len(tc.ExpectedModelOrder))
actualAssessmentOrder := make([]metrics.Assessments, 0, len(tc.ExpectedModelOrder))
actualScoreOrder := make([]uint, 0, len(tc.ExpectedScoreOrder))
assert.NoError(t, tc.AssessmentPerModel.WalkByScore(func(model model.Model, assessment metrics.Assessments, score uint) (err error) {
actualScoreOrder := make([]uint64, 0, len(tc.ExpectedScoreOrder))
assert.NoError(t, tc.AssessmentPerModel.WalkByScore(func(model model.Model, assessment metrics.Assessments, score uint64) (err error) {
actualModelOrder = append(actualModelOrder, model)
actualAssessmentOrder = append(actualAssessmentOrder, assessment)
actualScoreOrder = append(actualScoreOrder, score)
Expand All @@ -175,7 +175,7 @@ func TestWalkByScore(t *testing.T) {
AssessmentPerModel: AssessmentPerModel{},

ExpectedModelOrder: []model.Model{},
ExpectedScoreOrder: []uint{},
ExpectedScoreOrder: []uint64{},
})

validate(t, &testCase{
Expand All @@ -190,7 +190,7 @@ func TestWalkByScore(t *testing.T) {
ExpectedModelOrder: []model.Model{
modelA,
},
ExpectedScoreOrder: []uint{
ExpectedScoreOrder: []uint64{
1,
},
})
Expand All @@ -215,7 +215,7 @@ func TestWalkByScore(t *testing.T) {
modelB,
modelC,
},
ExpectedScoreOrder: []uint{
ExpectedScoreOrder: []uint64{
1,
2,
3,
Expand Down
Loading