Skip to content

Commit 666e774

Browse files
committed
Drop project_kb_msr2019 V1 importer
Fix CI failing test. Resolve merge conflict and update migration file. Remove duplication and create append_patch_classifications function. Update the project-kb test. Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent 80b43fe commit 666e774

File tree

17 files changed

+2780
-25578
lines changed

17 files changed

+2780
-25578
lines changed

vulnerabilities/importers/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
from vulnerabilities.importers import openssl
2525
from vulnerabilities.importers import oss_fuzz
2626
from vulnerabilities.importers import postgresql
27-
from vulnerabilities.importers import project_kb_msr2019
2827
from vulnerabilities.importers import redhat
2928
from vulnerabilities.importers import retiredotnet
3029
from vulnerabilities.importers import ruby
@@ -112,7 +111,6 @@
112111
mozilla.MozillaImporter,
113112
gentoo.GentooImporter,
114113
istio.IstioImporter,
115-
project_kb_msr2019.ProjectKBMSRImporter,
116114
suse_scores.SUSESeverityScoreImporter,
117115
elixir_security.ElixirSecurityImporter,
118116
xen.XenImporter,

vulnerabilities/importers/project_kb_msr2019.py

Lines changed: 0 additions & 46 deletions
This file was deleted.

vulnerabilities/improvers/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,6 @@
2020
from vulnerabilities.pipelines import populate_vulnerability_summary_pipeline
2121
from vulnerabilities.pipelines import remove_duplicate_advisories
2222
from vulnerabilities.pipelines.v2_improvers import collect_ssvc_trees
23-
from vulnerabilities.pipelines.v2_improvers import (
24-
collect_commits_project_kb as collect_commits_project_kb_v2,
25-
)
2623
from vulnerabilities.pipelines.v2_improvers import compute_advisory_todo as compute_advisory_todo_v2
2724
from vulnerabilities.pipelines.v2_improvers import compute_package_risk as compute_package_risk_v2
2825
from vulnerabilities.pipelines.v2_improvers import (
@@ -75,6 +72,5 @@
7572
unfurl_version_range_v2.UnfurlVersionRangePipeline,
7673
compute_advisory_todo.ComputeToDo,
7774
collect_ssvc_trees.CollectSSVCPipeline,
78-
collect_commits_project_kb_v2.CollectFixCommitsProjectKBPipeline,
7975
]
8076
)

vulnerabilities/pipelines/v2_importers/aosp_importer.py

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,9 @@
1616
from fetchcode.vcs import fetch_via_vcs
1717

1818
from vulnerabilities.importer import AdvisoryData
19-
from vulnerabilities.importer import AffectedPackageV2
20-
from vulnerabilities.importer import PackageCommitPatchData
21-
from vulnerabilities.importer import PatchData
22-
from vulnerabilities.importer import ReferenceV2
2319
from vulnerabilities.importer import VulnerabilitySeverity
2420
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
25-
from vulnerabilities.pipes.advisory import classify_patch_source
21+
from vulnerabilities.pipes.advisory import append_patch_classifications
2622
from vulnerabilities.severity_systems import GENERIC
2723

2824

@@ -90,23 +86,14 @@ def collect_advisories(self):
9086
patch_url = commit_data.get("patchUrl")
9187
commit_id = commit_data.get("commitId")
9288

93-
base_purl, patch_objs = classify_patch_source(
89+
append_patch_classifications(
9490
url=patch_url,
9591
commit_hash=commit_id,
9692
patch_text=None,
93+
affected_packages=affected_packages,
94+
references=references,
95+
patches=patches,
9796
)
98-
for patch_obj in patch_objs:
99-
if isinstance(patch_obj, PackageCommitPatchData):
100-
fixed_commit = patch_obj
101-
affected_package = AffectedPackageV2(
102-
package=base_purl,
103-
fixed_by_commit_patches=[fixed_commit],
104-
)
105-
affected_packages.append(affected_package)
106-
elif isinstance(patch_obj, PatchData):
107-
patches.append(patch_obj)
108-
elif isinstance(patch_obj, ReferenceV2):
109-
references.append(patch_obj)
11097

11198
url = (
11299
"https://raw.githubusercontent.com/quarkslab/aosp_dataset/refs/heads/master/cves/"

vulnerabilities/pipelines/v2_importers/project_kb_importer.py

Lines changed: 96 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,51 +7,66 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10-
import json
10+
import csv
1111
from pathlib import Path
1212
from typing import Iterable
1313

1414
import saneyaml
1515
from fetchcode.vcs import fetch_via_vcs
1616
from packageurl import PackageURL
17-
from univers.maven import VersionRange
17+
from univers.version_range import RANGE_CLASS_BY_SCHEMES
18+
from univers.versions import InvalidVersion
1819

1920
from vulnerabilities.importer import AdvisoryData
2021
from vulnerabilities.importer import AffectedPackageV2
21-
from vulnerabilities.importer import ReferenceV2
2222
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
23+
from vulnerabilities.pipes.advisory import append_patch_classifications
2324
from vulnerabilities.utils import get_advisory_url
25+
from vulnerabilities.utils import is_commit
2426

2527

2628
class ProjectKBPipeline(VulnerableCodeBaseImporterPipelineV2):
2729
"""
2830
ProjectKB Importer Pipeline
2931
Collect advisory from ProjectKB data:
3032
- YAML statements: https://github.com/SAP/project-kb/blob/vulnerability-data/statements/*/*.yaml
33+
- CSV database https://github.com/SAP/project-kb/blob/main/MSR2019/dataset/vulas_db_msr2019_release.csv
3134
"""
3235

3336
pipeline_id = "project-kb_v2"
3437
spdx_license_expression = "Apache-2.0"
3538
license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt"
36-
repo_url = "git+https://github.com/SAP/project-kb@vulnerability-data"
39+
main_branch = "git+https://github.com/SAP/project-kb"
40+
vuln_data_branch = "git+https://github.com/SAP/project-kb@vulnerability-data"
3741

3842
@classmethod
3943
def steps(cls):
4044
return (cls.clone_repo, cls.collect_and_store_advisories, cls.clean_downloads)
4145

4246
def clone_repo(self):
43-
self.log("Processing ProjectKB advisory data...")
44-
self.vcs_response = fetch_via_vcs(self.repo_url)
47+
self.log("Cloning ProjectKB advisory data...")
48+
self.main_branch_vcs = fetch_via_vcs(self.main_branch)
49+
self.vuln_data_branch_vcs = fetch_via_vcs(self.vuln_data_branch)
4550

4651
def advisories_count(self):
47-
base_path = Path(self.vcs_response.dest_dir) / "statements"
48-
count = sum(1 for _ in base_path.rglob("*.yaml"))
52+
base_path = Path(self.vuln_data_branch_vcs.dest_dir) / "statements"
53+
csv_path = (
54+
Path(self.main_branch_vcs.dest_dir) / "MSR2019/dataset/vulas_db_msr2019_release.csv"
55+
)
56+
57+
count_files = sum(1 for _ in base_path.rglob("*.yaml"))
58+
with open(csv_path, newline="", encoding="utf-8") as f:
59+
reader = csv.reader(f)
60+
next(reader, None)
61+
count_rows = sum(1 for _ in reader)
62+
63+
count = count_files + count_rows
4964
self.log(f"Estimated advisories to process: {count}")
5065
return count
5166

5267
def collect_advisories(self) -> Iterable[AdvisoryData]:
53-
"""Collect fix commits from YAML statements under /statements."""
54-
base_path = Path(self.vcs_response.dest_dir) / "statements"
68+
self.log("Collecting fix commits from YAML statements under /statements....")
69+
base_path = Path(self.vuln_data_branch_vcs.dest_dir) / "statements"
5570

5671
for yaml_file in base_path.rglob("*.yaml"):
5772
if yaml_file.name != "statement.yaml":
@@ -67,23 +82,30 @@ def collect_advisories(self) -> Iterable[AdvisoryData]:
6782
note_texts = []
6883
for note_entry in yaml_data.get("notes", []):
6984
text_content = note_entry.get("text")
70-
if text_content:
71-
note_texts.append(text_content)
85+
if not text_content:
86+
continue
87+
note_texts.append(text_content)
7288
description = "\n".join(note_texts)
7389

7490
references = []
91+
affected_packages = []
92+
patches = []
7593
for fix in yaml_data.get("fixes", []):
7694
for commit in fix.get("commits", []):
77-
commit_id = commit.get("id")
78-
repo_url = commit.get("repository")
79-
if not commit_id or not repo_url:
80-
continue
81-
82-
commit_url = repo_url.replace(".git", "") + "/commit/" + commit_id
83-
ref = ReferenceV2.from_url(commit_url)
84-
references.append(ref)
95+
commit_hash = commit.get("id")
96+
if not is_commit(commit_hash):
97+
commit_hash = None
98+
99+
vcs_url = commit.get("repository")
100+
append_patch_classifications(
101+
url=vcs_url,
102+
commit_hash=commit_hash,
103+
patch_text=None,
104+
affected_packages=affected_packages,
105+
references=references,
106+
patches=patches,
107+
)
85108

86-
affected_packages = []
87109
for artifact in yaml_data.get("artifacts", []):
88110
affected = artifact.get("affected")
89111
if not affected:
@@ -92,9 +114,20 @@ def collect_advisories(self) -> Iterable[AdvisoryData]:
92114
purl_str = artifact.get("id")
93115
purl = PackageURL.from_string(purl_str)
94116

117+
try:
118+
version_range_class = RANGE_CLASS_BY_SCHEMES.get(purl.type)
119+
version_class = (
120+
version_range_class.version_class if version_range_class else None
121+
)
122+
version_range = version_class(purl.version)
123+
except InvalidVersion:
124+
self.log(f"Invalid Version: {purl.version!r} for purl type: {purl.type!r}")
125+
continue
126+
95127
affected_package = AffectedPackageV2(
96128
package=PackageURL(type=purl.type, namespace=purl.namespace, name=purl.name),
97-
fixed_version_range=VersionRange.from_version(purl.version),
129+
fixed_version_range=version_range if not affected else None,
130+
affected_version_range=version_range if affected else None,
98131
)
99132
affected_packages.append(affected_package)
100133

@@ -106,19 +139,55 @@ def collect_advisories(self) -> Iterable[AdvisoryData]:
106139

107140
yield AdvisoryData(
108141
advisory_id=vulnerability_id,
109-
aliases=[],
110-
summary=description or "",
142+
summary=description,
111143
affected_packages=affected_packages,
112144
references_v2=references,
145+
patches=patches,
113146
url=advisory_url,
114-
original_advisory_text=json.dumps(yaml_data, indent=2, ensure_ascii=False),
147+
)
148+
149+
self.log("Collecting fix commits from ProjectKB ( vulas_db_msr2019_release )...")
150+
csv_path = (
151+
Path(self.main_branch_vcs.dest_dir) / "MSR2019/dataset/vulas_db_msr2019_release.csv"
152+
)
153+
154+
with open(csv_path, newline="", encoding="utf-8") as f:
155+
reader = csv.reader(f)
156+
next(reader, None) # skip header
157+
rows = [r for r in reader if len(r) == 4 and r[0]] # vuln_id, vcs_url, commit_hash, poc
158+
159+
for vuln_id, vcs_url, commit_hash, _ in rows:
160+
if not vuln_id or not vcs_url or not commit_hash:
161+
continue
162+
163+
patches = []
164+
affected_packages = []
165+
references = []
166+
append_patch_classifications(
167+
url=vcs_url,
168+
commit_hash=commit_hash,
169+
patch_text=None,
170+
affected_packages=affected_packages,
171+
references=references,
172+
patches=patches,
173+
)
174+
175+
yield AdvisoryData(
176+
advisory_id=vuln_id,
177+
affected_packages=affected_packages,
178+
patches=patches,
179+
references_v2=references,
180+
url="https://github.com/SAP/project-kb/blob/main/MSR2019/dataset/vulas_db_msr2019_release.csv",
115181
)
116182

117183
def clean_downloads(self):
118184
"""Remove the cloned repository from disk."""
119185
self.log("Removing cloned repository...")
120-
if self.vcs_response:
121-
self.vcs_response.delete()
186+
if self.main_branch_vcs:
187+
self.main_branch_vcs.delete()
188+
189+
if self.vuln_data_branch_vcs:
190+
self.vuln_data_branch_vcs.delete()
122191

123192
def on_failure(self):
124193
"""Ensure cleanup happens on pipeline failure."""

0 commit comments

Comments
 (0)