Skip to content

Commit f840de8

Browse files
LoCoBench Bot and claude committed
fix: nlqa-flow-003 ground truth schema + sec-cve-003 verifier rewrite
- nlqa-flow-003: Convert `pattern` (string) to `patterns` (array) and add missing `weight` field on all 21 ground truth items. Without this, the weighted_checklist scorer crashes with KeyError at runtime.
- sec-cve-003: Replace custom bash scorer with standard Python-based weighted_checklist template. Old scorer wrote to /logs/result.json (wrong path), used exit 1 on missing output (verifier crash), and used non-standard ground truth fields (value/path/function/code/event). New scorer writes to /logs/verifier/reward.txt, exits 0 on all paths, and the ground truth is converted to the standard patterns/weight schema.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c5a36ed commit f840de8

File tree

3 files changed

+254
-264
lines changed

3 files changed

+254
-264
lines changed

benchmarks/ccb_nlqa/nlqa-flow-003/tests/ground_truth.json

Lines changed: 63 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -7,94 +7,115 @@
77
},
88
"required_findings": [
99
{
10-
"pattern": "(?i)(ApplicationController|appcontroller\\.go).*reconcil",
11-
"description": "ApplicationController triggers reconciliation cycle"
10+
"patterns": ["(?i)(ApplicationController|appcontroller\\.go).*reconcil"],
11+
"description": "ApplicationController triggers reconciliation cycle",
12+
"weight": 1.0
1213
},
1314
{
14-
"pattern": "(?i)(GenerateManifests|repository\\.go).*manifest",
15-
"description": "RepoServer GenerateManifests() renders manifests from Git"
15+
"patterns": ["(?i)(GenerateManifests|repository\\.go).*manifest"],
16+
"description": "RepoServer GenerateManifests() renders manifests from Git",
17+
"weight": 1.0
1618
},
1719
{
18-
"pattern": "(?i)(ManifestRequest|ManifestResponse)",
19-
"description": "ManifestRequest/Response data structures for controller-reposerver communication"
20+
"patterns": ["(?i)(ManifestRequest|ManifestResponse)"],
21+
"description": "ManifestRequest/Response data structures for controller-reposerver communication",
22+
"weight": 1.0
2023
},
2124
{
22-
"pattern": "(?i)(diff\\.go|Diff\\(\\)|StateDiff|Normalize)",
23-
"description": "Diff engine computes difference between desired and live state"
25+
"patterns": ["(?i)(diff\\.go|Diff\\(\\)|StateDiff|Normalize)"],
26+
"description": "Diff engine computes difference between desired and live state",
27+
"weight": 1.0
2428
},
2529
{
26-
"pattern": "(?i)(DiffResult|ResourceDiff)",
27-
"description": "DiffResult data structure represents out-of-sync resources"
30+
"patterns": ["(?i)(DiffResult|ResourceDiff)"],
31+
"description": "DiffResult data structure represents out-of-sync resources",
32+
"weight": 1.0
2833
},
2934
{
30-
"pattern": "(?i)(3-way diff|last-applied-configuration)",
31-
"description": "3-way diff algorithm compares live, desired, and last-applied state"
35+
"patterns": ["(?i)(3-way diff|last-applied-configuration)"],
36+
"description": "3-way diff algorithm compares live, desired, and last-applied state",
37+
"weight": 1.0
3238
},
3339
{
34-
"pattern": "(?i)(sync.*phase|PreSync|PostSync|wave)",
35-
"description": "Sync phases (PreSync, Sync, PostSync) and waves orchestrate resource application"
40+
"patterns": ["(?i)(sync.*phase|PreSync|PostSync|wave)"],
41+
"description": "Sync phases (PreSync, Sync, PostSync) and waves orchestrate resource application",
42+
"weight": 1.0
3643
},
3744
{
38-
"pattern": "(?i)(kubectl apply|server-side apply|SSA)",
39-
"description": "Sync strategy: kubectl apply (client-side) or server-side apply"
45+
"patterns": ["(?i)(kubectl apply|server-side apply|SSA)"],
46+
"description": "Sync strategy: kubectl apply (client-side) or server-side apply",
47+
"weight": 1.0
4048
}
4149
],
4250
"file_references": [
4351
{
44-
"pattern": "controller/appcontroller\\.go",
45-
"description": "ApplicationController reconciliation loop"
52+
"patterns": ["controller/appcontroller\\.go"],
53+
"description": "ApplicationController reconciliation loop",
54+
"weight": 1.0
4655
},
4756
{
48-
"pattern": "reposerver/repository/repository\\.go",
49-
"description": "RepoServer manifest generation"
57+
"patterns": ["reposerver/repository/repository\\.go"],
58+
"description": "RepoServer manifest generation",
59+
"weight": 1.0
5060
},
5161
{
52-
"pattern": "util/diff/diff\\.go",
53-
"description": "Diff engine for state comparison"
62+
"patterns": ["util/diff/diff\\.go"],
63+
"description": "Diff engine for state comparison",
64+
"weight": 1.0
5465
},
5566
{
56-
"pattern": "pkg/apis/application/v1alpha1/",
57-
"description": "Application CRD definitions (SyncPolicy, SyncOperation, ApplicationStatus)"
67+
"patterns": ["pkg/apis/application/v1alpha1/"],
68+
"description": "Application CRD definitions (SyncPolicy, SyncOperation, ApplicationStatus)",
69+
"weight": 1.0
5870
},
5971
{
60-
"pattern": "util/argo/",
61-
"description": "Sync operation utilities"
72+
"patterns": ["util/argo/"],
73+
"description": "Sync operation utilities",
74+
"weight": 1.0
6275
},
6376
{
64-
"pattern": "cmd/argocd-application-controller/",
65-
"description": "Application controller command entry point"
77+
"patterns": ["cmd/argocd-application-controller/"],
78+
"description": "Application controller command entry point",
79+
"weight": 1.0
6680
}
6781
],
6882
"causal_chain": [
6983
{
70-
"pattern": "(?i)(git.*clone|git.*fetch|revision|commit.*SHA)",
71-
"description": "Step 1: Git repository fetch at specified revision"
84+
"patterns": ["(?i)(git.*clone|git.*fetch|revision|commit.*SHA)"],
85+
"description": "Step 1: Git repository fetch at specified revision",
86+
"weight": 1.0
7287
},
7388
{
74-
"pattern": "(?i)(Helm|Kustomize|CMP|plugin).*render",
75-
"description": "Step 2: Config management tool renders raw manifests"
89+
"patterns": ["(?i)(Helm|Kustomize|CMP|plugin).*render"],
90+
"description": "Step 2: Config management tool renders raw manifests",
91+
"weight": 1.0
7692
},
7793
{
78-
"pattern": "(?i)(normalize|strip.*metadata|ignore.*difference)",
79-
"description": "Step 3: Resource normalization before comparison"
94+
"patterns": ["(?i)(normalize|strip.*metadata|ignore.*difference)"],
95+
"description": "Step 3: Resource normalization before comparison",
96+
"weight": 1.0
8097
},
8198
{
82-
"pattern": "(?i)(diff.*result|out-of-sync|created|modified|deleted)",
83-
"description": "Step 4: Diff computation produces list of changes"
99+
"patterns": ["(?i)(diff.*result|out-of-sync|created|modified|deleted)"],
100+
"description": "Step 4: Diff computation produces list of changes",
101+
"weight": 1.0
84102
},
85103
{
86-
"pattern": "(?i)(apply|sync.*operation|cluster.*update)",
87-
"description": "Step 5: Sync operation applies changes to Kubernetes cluster"
104+
"patterns": ["(?i)(apply|sync.*operation|cluster.*update)"],
105+
"description": "Step 5: Sync operation applies changes to Kubernetes cluster",
106+
"weight": 1.0
88107
}
89108
],
90109
"negative_checks": [
91110
{
92-
"pattern": "(?i)(Flux|Spinnaker|Jenkins)",
93-
"description": "Should NOT mention other CD tools (Flux, Spinnaker, Jenkins) — Argo CD has unique architecture"
111+
"patterns": ["(?i)(Flux|Spinnaker|Jenkins)"],
112+
"description": "Should NOT mention other CD tools (Flux, Spinnaker, Jenkins) — Argo CD has unique architecture",
113+
"weight": 1.0
94114
},
95115
{
96-
"pattern": "(?i)(push.*git|write.*git|git.*commit.*cluster)",
97-
"description": "Should NOT claim Argo CD writes cluster state back to Git — it's pull-based, not push-based"
116+
"patterns": ["(?i)(push.*git|write.*git|git.*commit.*cluster)"],
117+
"description": "Should NOT claim Argo CD writes cluster state back to Git — it's pull-based, not push-based",
118+
"weight": 1.0
98119
}
99120
]
100121
}

benchmarks/ccb_security/sec-cve-003/tests/ground_truth.json

Lines changed: 71 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -1,167 +1,128 @@
11
{
2-
"task_id": "sec-cve-003",
3-
"cve_id": "CVE-2023-39325",
4-
"vulnerability_type": "uncontrolled_resource_consumption",
5-
"severity": "high",
6-
"cvss_score": 7.5,
7-
2+
"weights": {
3+
"required_findings": 0.40,
4+
"file_references": 0.20,
5+
"causal_chain": 0.20,
6+
"negative_checks": 0.10
7+
},
88
"required_findings": [
99
{
10-
"type": "vulnerable_function",
11-
"value": "processHeaders",
12-
"file": "http2/server.go",
13-
"weight": 0.20,
14-
"description": "The processHeaders function directly spawns handler goroutines without checking limits"
10+
"patterns": ["(?i)processHeaders"],
11+
"description": "The processHeaders function directly spawns handler goroutines without checking limits",
12+
"weight": 2.0
1513
},
1614
{
17-
"type": "vulnerable_pattern",
18-
"value": "go sc.runHandler",
19-
"context": "unbounded goroutine creation",
20-
"weight": 0.15,
21-
"description": "Direct goroutine spawning without tracking or limiting active handlers"
15+
"patterns": ["(?i)go\\s+sc\\.runHandler", "(?i)runHandler"],
16+
"description": "Direct goroutine spawning without tracking or limiting active handlers",
17+
"weight": 1.5
2218
},
2319
{
24-
"type": "missing_check",
25-
"value": "handler_goroutine_limit",
26-
"context": "no enforcement of MaxConcurrentStreams on handler goroutines",
27-
"weight": 0.15,
28-
"description": "The vulnerability exists because handler goroutines are not counted or limited"
20+
"patterns": ["(?i)(handler.*goroutine.*limit|MaxConcurrentStreams.*handler|limit.*handler)"],
21+
"description": "The vulnerability exists because handler goroutines are not counted or limited",
22+
"weight": 1.5
2923
},
3024
{
31-
"type": "vulnerability_type",
32-
"value": "denial_of_service",
33-
"context": "resource exhaustion via rapid stream reset",
34-
"weight": 0.10,
35-
"description": "HTTP/2 rapid reset attack causing unbounded goroutine accumulation"
25+
"patterns": ["(?i)(denial.of.service|DoS|resource.exhaustion)"],
26+
"description": "HTTP/2 rapid reset attack causing unbounded goroutine accumulation",
27+
"weight": 1.0
3628
},
3729
{
38-
"type": "attack_vector",
39-
"value": "rst_stream",
40-
"context": "RST_STREAM frees stream slots while handlers keep running",
41-
"weight": 0.10,
42-
"description": "Attacker rapidly creates streams and resets them to spawn unbounded handlers"
30+
"patterns": ["(?i)(RST_STREAM|rapid.reset|rst.stream)"],
31+
"description": "Attacker rapidly creates streams and resets them to spawn unbounded handlers",
32+
"weight": 1.0
4333
},
4434
{
45-
"type": "affected_component",
46-
"value": "http2_server",
47-
"context": "golang.org/x/net/http2 package",
48-
"weight": 0.10,
49-
"description": "The http2 server implementation in x/net"
35+
"patterns": ["(?i)(http2.*server|x/net/http2|golang.org/x/net)"],
36+
"description": "The http2 server implementation in x/net",
37+
"weight": 1.0
5038
}
5139
],
52-
5340
"file_references": [
5441
{
55-
"path": "http2/server.go",
56-
"relevance": "primary",
57-
"weight": 0.10,
58-
"description": "Contains the vulnerable processHeaders and runHandler functions"
42+
"patterns": ["http2/server\\.go"],
43+
"description": "Contains the vulnerable processHeaders and runHandler functions",
44+
"weight": 2.0
5945
},
6046
{
61-
"path": "http2/frame.go",
62-
"relevance": "supporting",
63-
"weight": 0.05,
64-
"description": "Defines RST_STREAM frame handling used in the attack"
47+
"patterns": ["http2/frame\\.go"],
48+
"description": "Defines RST_STREAM frame handling used in the attack",
49+
"weight": 1.0
6550
},
6651
{
67-
"path": "http2/transport.go",
68-
"relevance": "supporting",
69-
"weight": 0.05,
70-
"description": "HTTP/2 stream management, relevant for understanding stream lifecycle"
52+
"patterns": ["http2/transport\\.go"],
53+
"description": "HTTP/2 stream management, relevant for understanding stream lifecycle",
54+
"weight": 1.0
7155
}
7256
],
73-
7457
"causal_chain": [
7558
{
76-
"step": 1,
77-
"function": "serve",
78-
"file": "http2/server.go",
79-
"description": "Main server loop processes incoming HTTP/2 frames"
59+
"patterns": ["(?i)(serve|server.*loop|incoming.*frame)"],
60+
"description": "Step 1: Main server loop processes incoming HTTP/2 frames",
61+
"weight": 1.0
8062
},
8163
{
82-
"step": 2,
83-
"function": "processHeaders",
84-
"file": "http2/server.go",
85-
"description": "Processes HEADERS frame and spawns handler goroutine"
64+
"patterns": ["(?i)(processHeaders|HEADERS.*frame)"],
65+
"description": "Step 2: Processes HEADERS frame and spawns handler goroutine",
66+
"weight": 1.0
8667
},
8768
{
88-
"step": 3,
89-
"code": "go sc.runHandler(rw, req, handler)",
90-
"file": "http2/server.go",
91-
"description": "VULNERABLE: Spawns handler goroutine without checking limit"
69+
"patterns": ["(?i)(runHandler|spawn.*goroutine|go\\s+sc)"],
70+
"description": "Step 3: Spawns handler goroutine without checking limit (VULNERABLE)",
71+
"weight": 1.0
9272
},
9373
{
94-
"step": 4,
95-
"event": "RST_STREAM received",
96-
"description": "Attacker sends RST_STREAM to free stream slot"
74+
"patterns": ["(?i)(RST_STREAM|reset.*stream|stream.*reset)"],
75+
"description": "Step 4: Attacker sends RST_STREAM to free stream slot",
76+
"weight": 1.0
9777
},
9878
{
99-
"step": 5,
100-
"event": "stream slot freed",
101-
"description": "Stream slot becomes available, but handler goroutine continues running"
79+
"patterns": ["(?i)(stream.*slot.*free|handler.*continue|goroutine.*accumul)"],
80+
"description": "Step 5: Stream slot freed but handler goroutine continues running",
81+
"weight": 1.0
10282
},
10383
{
104-
"step": 6,
105-
"event": "rapid repetition",
106-
"description": "Attacker repeats steps 2-5 rapidly to accumulate handler goroutines"
107-
},
108-
{
109-
"step": 7,
110-
"impact": "resource exhaustion",
111-
"description": "Unbounded handler goroutines consume CPU and memory, causing DoS"
84+
"patterns": ["(?i)(rapid.*repeat|unbounded|exhaust|CPU|memory|DoS)"],
85+
"description": "Step 6-7: Rapid repetition causes unbounded goroutine accumulation and resource exhaustion",
86+
"weight": 1.0
11287
}
11388
],
114-
11589
"negative_checks": [
11690
{
117-
"type": "wrong_component",
118-
"value": "TLS layer",
119-
"weight": 0.05,
120-
"description": "The vulnerability is NOT in TLS, it's in HTTP/2 stream handling"
91+
"patterns": ["(?i)\\bTLS\\b.*\\b(layer|vulnerab|bug|flaw)"],
92+
"description": "The vulnerability is NOT in TLS, it's in HTTP/2 stream handling",
93+
"weight": 1.0
12194
},
12295
{
123-
"type": "wrong_component",
124-
"value": "HTTP/1.1",
125-
"weight": 0.05,
126-
"description": "The vulnerability is specific to HTTP/2, not HTTP/1.1"
96+
"patterns": ["(?i)\\bHTTP/1\\.1\\b.*\\b(vulnerab|affect|bug|flaw)"],
97+
"description": "The vulnerability is specific to HTTP/2, not HTTP/1.1",
98+
"weight": 1.0
12799
},
128100
{
129-
"type": "wrong_vulnerability_type",
130-
"value": "buffer overflow",
131-
"weight": 0.05,
132-
"description": "This is NOT a memory corruption bug, it's resource exhaustion"
101+
"patterns": ["(?i)\\bbuffer\\s+overflow\\b"],
102+
"description": "This is NOT a memory corruption bug, it's resource exhaustion",
103+
"weight": 1.0
133104
},
134105
{
135-
"type": "wrong_attack_vector",
136-
"value": "malformed headers",
137-
"weight": 0.05,
138-
"description": "Attack uses valid HTTP/2 frames (RST_STREAM), not malformed input"
106+
"patterns": ["(?i)\\bmalformed\\s+headers?\\b.*\\b(attack|exploit|vuln)"],
107+
"description": "Attack uses valid HTTP/2 frames (RST_STREAM), not malformed input",
108+
"weight": 1.0
139109
},
140110
{
141-
"type": "wrong_fix",
142-
"value": "input validation",
143-
"weight": 0.05,
144-
"description": "Fix is not about validating input, but limiting concurrent handler goroutines"
111+
"patterns": ["(?i)\\binput\\s+validation\\b.*\\bfix\\b"],
112+
"description": "Fix is not about validating input, but limiting concurrent handler goroutines",
113+
"weight": 1.0
145114
}
146115
],
147-
148-
"remediation_keywords": [
149-
"limit handler goroutines",
150-
"enforce MaxConcurrentStreams on handlers",
151-
"queue deferred handlers",
152-
"track active handler count",
153-
"backpressure mechanism",
154-
"scheduleHandler function",
155-
"curHandlers counter",
156-
"unstartedHandlers queue"
157-
],
158-
159116
"metadata": {
117+
"task_id": "sec-cve-003",
118+
"cve_id": "CVE-2023-39325",
119+
"vulnerability_type": "uncontrolled_resource_consumption",
120+
"severity": "high",
121+
"cvss_score": 7.5,
160122
"fix_commit": "b225e7ca6dde1ef5a5ae5ce922861bda011cfabd",
161123
"vulnerable_commit": "88194ad8ab44a02ea952c169883c3f57db6cf9f4",
162124
"fix_version": "v0.17.0",
163125
"vulnerable_version": "v0.16.0",
164-
"disclosure_date": "2023-10-10",
165126
"also_known_as": "CVE-2023-44487",
166127
"attack_name": "HTTP/2 Rapid Reset"
167128
}

0 commit comments

Comments
 (0)