symflower · bauersimon · Jun 4, 2024 · May 20, 2024 · May 28, 2024 · May 20, 2024
diff --git a/README.md b/README.md
@@ -128,7 +128,7 @@ DONE 1 tests in 0.348s
 Total coverage 100.000000%
 2024/05/02 10:02:09 Evaluated model "openrouter/meta-llama/llama-3-70b-instruct" using language "java" and repository "java/plain": encountered 0 problems: []
 2024/05/02 10:02:09 Evaluating models and languages
-2024/05/02 10:02:09 Evaluation score for "openrouter/meta-llama/llama-3-70b-instruct" ("code-no-excess"): score=12, coverage-statement=2, files-executed=2, response-no-error=2, response-no-excess=2, response-not-empty=2, response-with-code=2
+2024/05/02 10:02:09 Evaluation score for "openrouter/meta-llama/llama-3-70b-instruct" ("code-no-excess"): score=12, coverage=2, files-executed=2, response-no-error=2, response-no-excess=2, response-not-empty=2, response-with-code=2
 ````
 
 </details>
@@ -139,8 +139,8 @@ The execution by default also creates a report file `REPORT.md` that contains a
 
 With `DevQualityEval` we answer the following questions:
 
-- Which LLMs can solve software development tasks?
-- How good is the quality of their results?
+-   Which LLMs can solve software development tasks?
+-   How good is the quality of their results?
 
 Programming is a non-trivial profession. Even writing tests for an empty function requires substantial knowledge of the used programming language and its conventions. We already investigated this challenge and how many LLMs failed at it in our [first `DevQualityEval` report](https://symflower.com/en/company/blog/2024/can-ai-test-a-go-function-that-does-nothing/#why-evaluate-an-empty-function). This highlights the need for a **benchmarking framework for evaluating AI performance on software development task solving**.
 
@@ -165,21 +165,21 @@ On a high level, `DevQualityEval` asks the model to produce tests for an example
 
 Currently, the following points are awarded for this task:
 
-- `response-no-error`: `+1` if the response did not encounter an error
-- `response-not-empty`: `+1` if the response is not empty
-- `response-with-code`: `+1` if the response contained source code
-- `compiled`: `+1` if the source code compiled
-- `statement-coverage-reached`: `+10` if the generated tests reach 100% coverage
-- `no-excess`: `+1` if the response did not contain more content than requested
+-   `response-no-error`: `+1` if the response did not encounter an error
+-   `response-not-empty`: `+1` if the response is not empty
+-   `response-with-code`: `+1` if the response contained source code
+-   `compiled`: `+1` if the source code compiled
+-   `statement-coverage-reached`: `+10` if the generated tests reach 100% coverage
+-   `no-excess`: `+1` if the response did not contain more content than requested
 
 #### Cases
 
 Currently, the following cases are available for this task:
 
-- Java
-  - `plain/src/main/java/plain.java`: An empty function that does nothing.
-- Go
-  - `plain/plain.go`: An empty function that does nothing.
+-   Java
+    -   `plain/src/main/java/plain.java`: An empty function that does nothing.
+-   Go
+    -   `plain/plain.go`: An empty function that does nothing.
 
 ## Results
 

diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go
@@ -62,6 +62,8 @@ type Evaluate struct {
 	Runs uint `long:"runs" description:"Number of runs to perform." default:"1"`
 	// RunsSequential indicates that interleaved runs are disabled and runs are performed sequentially.
 	RunsSequential bool `long:"runs-sequential" description:"By default, multiple runs are performed in an interleaved fashion to avoid caching of model responses. Changing this behavior to \"sequential\" queries the same model repeatedly instead."`
+	// NoDisqualification indicates that models are not to be disqualified if they fail to solve basic language tasks.
+	NoDisqualification bool `long:"no-disqualification" description:"By default, models that cannot solve basic language tasks are disqualified for more complex tasks. Overwriting this behavior runs all tasks regardless."`
 
 	// logger holds the logger of the command.
 	logger *log.Logger
@@ -292,8 +294,9 @@ func (command *Evaluate) Execute(args []string) (err error) {
 		ResultPath:      command.ResultPath,
 		TestdataPath:    command.TestdataPath,
 
-		Runs:           command.Runs,
-		RunsSequential: command.RunsSequential,
+		Runs:               command.Runs,
+		RunsSequential:     command.RunsSequential,
+		NoDisqualification: command.NoDisqualification,
 	})
 
 	assessmentsPerModel := assessments.CollapseByModel()

diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go
diff --git a/evaluate/evaluate.go b/evaluate/evaluate.go
@@ -1,12 +1,12 @@
 package evaluate
 
 import (
-	"log"
 	"os"
 	"path/filepath"
 
 	"github.com/symflower/eval-dev-quality/evaluate/report"
 	evallanguage "github.com/symflower/eval-dev-quality/language"
+	"github.com/symflower/eval-dev-quality/log"
 	evalmodel "github.com/symflower/eval-dev-quality/model"
 	"github.com/symflower/eval-dev-quality/provider"
 )
@@ -37,6 +37,8 @@ type Context struct {
 	Runs uint
 	// RunsSequential indicates that interleaved runs are disabled and runs are performed sequentially.
 	RunsSequential bool
+	// NoDisqualification indicates that models are not to be disqualified if they fail to solve basic language tasks.
+	NoDisqualification bool
 }
 
 // runsAtLanguageLevel returns how many runs to perform on language level.
@@ -189,8 +191,8 @@ func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePer
 				for _, model := range ctx.Models {
 					modelID := model.ID()
 
-					if !modelSucceededBasicChecksOfLanguage[model][language] {
-						log.Printf("Excluding model %q for language %q cause it did not succeed basic checks", model.ID(), language.ID())
+					if !ctx.NoDisqualification && !modelSucceededBasicChecksOfLanguage[model][language] {
+						ctx.Log.Printf("Excluding model %q for language %q cause it did not succeed basic checks", model.ID(), language.ID())
 
 						continue
 					}

diff --git a/evaluate/evaluate_test.go b/evaluate/evaluate_test.go
@@ -396,10 +396,12 @@ func TestEvaluate(t *testing.T) {
 					mockedModel: map[language.Language]map[string]metrics.Assessments{
 						languageGolang: map[string]metrics.Assessments{
 							repositoryPlainPath: map[metrics.AssessmentKey]uint64{
+								metrics.AssessmentKeyCoverage:        0,
 								metrics.AssessmentKeyFilesExecuted:   2,
 								metrics.AssessmentKeyResponseNoError: 2,
 							},
 							repositoryNextPath: map[metrics.AssessmentKey]uint64{
+								metrics.AssessmentKeyCoverage:        0,
 								metrics.AssessmentKeyFilesExecuted:   1,
 								metrics.AssessmentKeyResponseNoError: 1,
 							},
@@ -459,10 +461,12 @@ func TestEvaluate(t *testing.T) {
 					mockedModel: map[language.Language]map[string]metrics.Assessments{
 						languageGolang: map[string]metrics.Assessments{
 							repositoryPlainPath: map[metrics.AssessmentKey]uint64{
+								metrics.AssessmentKeyCoverage:        0,
 								metrics.AssessmentKeyFilesExecuted:   1,
 								metrics.AssessmentKeyResponseNoError: 1,
 							},
 							repositoryNextPath: map[metrics.AssessmentKey]uint64{
+								metrics.AssessmentKeyCoverage:        0,
 								metrics.AssessmentKeyFilesExecuted:   2,
 								metrics.AssessmentKeyResponseNoError: 2,
 							},
@@ -566,6 +570,7 @@ func TestEvaluate(t *testing.T) {
 					mockedModel: map[language.Language]map[string]metrics.Assessments{
 						languageGolang: map[string]metrics.Assessments{
 							repositoryPath: map[metrics.AssessmentKey]uint64{
+								metrics.AssessmentKeyCoverage:        0,
 								metrics.AssessmentKeyFilesExecuted:   3,
 								metrics.AssessmentKeyResponseNoError: 3,
 							},
@@ -617,6 +622,7 @@ func TestEvaluate(t *testing.T) {
 					mockedModel: map[language.Language]map[string]metrics.Assessments{
 						languageGolang: map[string]metrics.Assessments{
 							repositoryPath: map[metrics.AssessmentKey]uint64{
+								metrics.AssessmentKeyCoverage:        0,
 								metrics.AssessmentKeyFilesExecuted:   3,
 								metrics.AssessmentKeyResponseNoError: 3,
 							},
@@ -698,6 +704,7 @@ func TestEvaluate(t *testing.T) {
 					mockedModel: map[language.Language]map[string]metrics.Assessments{
 						languageGolang: map[string]metrics.Assessments{
 							repositoryPath: map[metrics.AssessmentKey]uint64{
+								metrics.AssessmentKeyCoverage:        0,
 								metrics.AssessmentKeyFilesExecuted:   3,
 								metrics.AssessmentKeyResponseNoError: 3,
 							},
@@ -764,6 +771,7 @@ func TestEvaluate(t *testing.T) {
 					mockedModel: map[language.Language]map[string]metrics.Assessments{
 						languageGolang: map[string]metrics.Assessments{
 							repositoryPath: map[metrics.AssessmentKey]uint64{
+								metrics.AssessmentKeyCoverage:        0,
 								metrics.AssessmentKeyFilesExecuted:   3,
 								metrics.AssessmentKeyResponseNoError: 3,
 							},

diff --git a/evaluate/metrics/assessment.go b/evaluate/metrics/assessment.go
@@ -16,19 +16,19 @@ var (
 	// AllAssessmentKeysStrings returns all registered assessment keys as strings.
 	AllAssessmentKeysStrings []string
 
-	// pointsPerAssessment holds the points awarded for a specific assessment.
-	pointsPerAssessment = map[AssessmentKey]uint64{}
+	// multiplierPerAssessment holds the multipliers awarded for a specific assessment.
+	multiplierPerAssessment = map[AssessmentKey]uint64{}
 )
 
 // RegisterAssessmentKey registers a new assessment key.
-// If the points for this assessment type are zero, it is ignored for the score computation.
-func RegisterAssessmentKey(key string, points uint64) AssessmentKey {
+// If the multiplier for this assessment type is zero, it is ignored for the score computation.
+func RegisterAssessmentKey(key string, multiplier uint64) AssessmentKey {
 	assessment := AssessmentKey(key)
 	i := sort.SearchStrings(AllAssessmentKeysStrings, key)
 
 	allAssessmentKeys = slices.Insert(allAssessmentKeys, i, assessment)
 	AllAssessmentKeysStrings = slices.Insert(AllAssessmentKeysStrings, i, key)
-	pointsPerAssessment[assessment] = points
+	multiplierPerAssessment[assessment] = multiplier
 
 	return assessment
 }
@@ -39,8 +39,8 @@ var (
 	// AssessmentKeyProcessingTime holds the time in milliseconds that it took to complete the task.
 	AssessmentKeyProcessingTime = RegisterAssessmentKey("processing-time", 0)
 
-	// AssessmentKeyCoverageStatement counts the cases where 100% coverage was reached.
-	AssessmentKeyCoverageStatement = RegisterAssessmentKey("coverage-statement", 10)
+	// AssessmentKeyCoverage counts execution coverage objects.
+	AssessmentKeyCoverage = RegisterAssessmentKey("coverage", 10)
 
 	// AssessmentKeyResponseCharacterCount counts the number of characters of a response.
 	AssessmentKeyResponseCharacterCount = RegisterAssessmentKey("response-character-count", 0)
@@ -106,7 +106,7 @@ func (a Assessments) Score() (score uint64) {
 	}
 
 	for key, value := range a {
-		if pointsPerAssessment[key] != 0 {
+		if multiplierPerAssessment[key] != 0 {
 			score += value
 		}
 	}
@@ -116,7 +116,12 @@ func (a Assessments) Score() (score uint64) {
 
 // Award yields the score points defined for the given key.
 func (a Assessments) Award(key AssessmentKey) {
-	a[key] += pointsPerAssessment[key]
+	a[key] += multiplierPerAssessment[key]
+}
+
+// AwardPoints yields the score points defined for the given key multiplied by the given count.
+func (a Assessments) AwardPoints(key AssessmentKey, count uint64) {
+	a[key] += multiplierPerAssessment[key] * count
 }
 
 // String returns a string representation of the metrics.

diff --git a/evaluate/metrics/assessment_test.go b/evaluate/metrics/assessment_test.go
@@ -137,7 +137,7 @@ func TestAssessmentString(t *testing.T) {
 
 		Assessment: NewAssessments(),
 
-		ExpectedString: "score=0, coverage-statement=0, files-executed=0, generate-tests-for-file-character-count=0, processing-time=0, response-character-count=0, response-no-error=0, response-no-excess=0, response-with-code=0",
+		ExpectedString: "score=0, coverage=0, files-executed=0, generate-tests-for-file-character-count=0, processing-time=0, response-character-count=0, response-no-error=0, response-no-excess=0, response-with-code=0",
 	})
 
 	validate(t, &testCase{
@@ -146,15 +146,15 @@ func TestAssessmentString(t *testing.T) {
 		Assessment: Assessments{
 			AssessmentKeyGenerateTestsForFileCharacterCount: 50,
 			AssessmentKeyResponseCharacterCount:             100,
-			AssessmentKeyCoverageStatement:                  1,
+			AssessmentKeyCoverage:                           1,
 			AssessmentKeyFilesExecuted:                      2,
 			AssessmentKeyResponseNoError:                    3,
 			AssessmentKeyResponseNoExcess:                   4,
 			AssessmentKeyResponseWithCode:                   5,
 			AssessmentKeyProcessingTime:                     200,
 		},
 
-		ExpectedString: "score=15, coverage-statement=1, files-executed=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5",
+		ExpectedString: "score=15, coverage=1, files-executed=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5",
 	})
 }
 
@@ -264,9 +264,9 @@ func TestAssessmentsScore(t *testing.T) {
 		Name: "Values Assessment",
 
 		Assessments: Assessments{
-			AssessmentKeyFilesExecuted:     5,
-			AssessmentKeyCoverageStatement: 4,
-			AssessmentKeyProcessingTime:    200,
+			AssessmentKeyFilesExecuted:  5,
+			AssessmentKeyCoverage:       4,
+			AssessmentKeyProcessingTime: 200,
 		},
 
 		ExpectedScore: uint64(9),

diff --git a/evaluate/metrics/category.go b/evaluate/metrics/category.go
@@ -82,15 +82,15 @@ func (a Assessments) Category(totalTasks uint64) *AssessmentCategory {
 	}
 
 	switch {
-	case a[AssessmentKeyResponseNoError] != totalTasks*pointsPerAssessment[AssessmentKeyResponseNoError]:
+	case a[AssessmentKeyResponseNoError] != totalTasks*multiplierPerAssessment[AssessmentKeyResponseNoError]:
 		return AssessmentCategoryResponseError
-	case a[AssessmentKeyResponseWithCode] != totalTasks*pointsPerAssessment[AssessmentKeyResponseWithCode] && a[AssessmentKeyFilesExecuted] != totalTasks*pointsPerAssessment[AssessmentKeyFilesExecuted]: // TODO We cannot always detect yet if a model response contains source code, so ensure we don't categorize into "no code" if the code actually ran successfully all the time. https://github.com/symflower/eval-dev-quality/issues/43
+	case a[AssessmentKeyResponseWithCode] != totalTasks*multiplierPerAssessment[AssessmentKeyResponseWithCode] && a[AssessmentKeyFilesExecuted] != totalTasks*multiplierPerAssessment[AssessmentKeyFilesExecuted]: // TODO We cannot always detect yet if a model response contains source code, so ensure we don't categorize into "no code" if the code actually ran successfully all the time. https://github.com/symflower/eval-dev-quality/issues/43
 		return AssessmentCategoryResponseNoCode
-	case a[AssessmentKeyFilesExecuted] != totalTasks*pointsPerAssessment[AssessmentKeyFilesExecuted]:
+	case a[AssessmentKeyFilesExecuted] != totalTasks*multiplierPerAssessment[AssessmentKeyFilesExecuted]:
 		return AssessmentCategoryCodeInvalid
-	case a[AssessmentKeyCoverageStatement] != totalTasks*pointsPerAssessment[AssessmentKeyCoverageStatement]:
+	case a[AssessmentKeyCoverage] != totalTasks*multiplierPerAssessment[AssessmentKeyCoverage]:
 		return AssessmentCategoryCodeExecuted
-	case a[AssessmentKeyResponseNoExcess] != totalTasks*pointsPerAssessment[AssessmentKeyResponseNoExcess]:
+	case a[AssessmentKeyResponseNoExcess] != totalTasks*multiplierPerAssessment[AssessmentKeyResponseNoExcess]:
 		return AssessmentCategoryCodeCoverageStatementReached
 	default:
 		return AssessmentCategoryCodeNoExcess

diff --git a/evaluate/metrics/category_test.go b/evaluate/metrics/category_test.go
@@ -94,10 +94,10 @@ func TestAssessmentsCategory(t *testing.T) {
 		Name: "Full Statement Coverage",
 
 		Assessments: Assessments{
-			AssessmentKeyResponseNoError:   1,
-			AssessmentKeyResponseWithCode:  1,
-			AssessmentKeyFilesExecuted:     1,
-			AssessmentKeyCoverageStatement: 10,
+			AssessmentKeyResponseNoError:  1,
+			AssessmentKeyResponseWithCode: 1,
+			AssessmentKeyFilesExecuted:    1,
+			AssessmentKeyCoverage:         10,
 		},
 		Total: 1,
 
@@ -108,11 +108,11 @@ func TestAssessmentsCategory(t *testing.T) {
 		Name: "No Excess",
 
 		Assessments: Assessments{
-			AssessmentKeyResponseNoError:   1,
-			AssessmentKeyResponseWithCode:  1,
-			AssessmentKeyFilesExecuted:     1,
-			AssessmentKeyCoverageStatement: 10,
-			AssessmentKeyResponseNoExcess:  1,
+			AssessmentKeyResponseNoError:  1,
+			AssessmentKeyResponseWithCode: 1,
+			AssessmentKeyFilesExecuted:    1,
+			AssessmentKeyCoverage:         10,
+			AssessmentKeyResponseNoExcess: 1,
 		},
 		Total: 1,
 
@@ -123,11 +123,11 @@ func TestAssessmentsCategory(t *testing.T) {
 		Name: "Inconsistent",
 
 		Assessments: Assessments{
-			AssessmentKeyResponseNoError:   2,
-			AssessmentKeyResponseWithCode:  2,
-			AssessmentKeyFilesExecuted:     2,
-			AssessmentKeyCoverageStatement: 1,
-			AssessmentKeyResponseNoExcess:  0,
+			AssessmentKeyResponseNoError:  2,
+			AssessmentKeyResponseWithCode: 2,
+			AssessmentKeyFilesExecuted:    2,
+			AssessmentKeyCoverage:         1,
+			AssessmentKeyResponseNoExcess: 0,
 		},
 		Total: 2,