# See https://aboutcode.org for more information about nexB OSS projects.
#

import csv
from pathlib import Path
from typing import Iterable

import saneyaml
from fetchcode.vcs import fetch_via_vcs
from packageurl import PackageURL
from univers.version_range import RANGE_CLASS_BY_SCHEMES
from univers.versions import InvalidVersion

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import AffectedPackageV2
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
from vulnerabilities.pipes.advisory import append_patch_classifications
from vulnerabilities.utils import get_advisory_url
from vulnerabilities.utils import is_commit


class ProjectKBPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    ProjectKB Importer Pipeline
    Collect advisories from ProjectKB data:
    - YAML statements: https://github.com/SAP/project-kb/blob/vulnerability-data/statements/*/*.yaml
    - CSV database: https://github.com/SAP/project-kb/blob/main/MSR2019/dataset/vulas_db_msr2019_release.csv
    """

    pipeline_id = "project-kb_v2"
    spdx_license_expression = "Apache-2.0"
    license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt"
    main_branch = "git+https://github.com/SAP/project-kb"
    vuln_data_branch = "git+https://github.com/SAP/project-kb@vulnerability-data"

    @classmethod
    def steps(cls):
        return (cls.clone_repo, cls.collect_and_store_advisories, cls.clean_downloads)

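    # Flow note (hedged, assuming the standard VulnerableCodeBaseImporterPipelineV2
    # behavior): the base pipeline runs these steps in order. clone_repo fetches
    # both checkouts, collect_and_store_advisories calls advisories_count() and
    # then iterates collect_advisories(), and clean_downloads removes the clones.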
    def clone_repo(self):
        self.log("Cloning ProjectKB advisory data...")
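        # Two checkouts are needed: the YAML statements live on the
        # vulnerability-data branch while the MSR2019 CSV dataset lives on main.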
        self.main_branch_vcs = fetch_via_vcs(self.main_branch)
        self.vuln_data_branch_vcs = fetch_via_vcs(self.vuln_data_branch)

    def advisories_count(self):
        base_path = Path(self.vuln_data_branch_vcs.dest_dir) / "statements"
        csv_path = (
            Path(self.main_branch_vcs.dest_dir) / "MSR2019/dataset/vulas_db_msr2019_release.csv"
        )

        # Note: rglob counts every YAML file while collect_advisories only
        # parses files named statement.yaml, so this is an upper-bound estimate.
        count_files = sum(1 for _ in base_path.rglob("*.yaml"))
        with open(csv_path, newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row
            count_rows = sum(1 for _ in reader)

        count = count_files + count_rows
        self.log(f"Estimated advisories to process: {count}")
        return count

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        self.log("Collecting fix commits from YAML statements under /statements...")
        base_path = Path(self.vuln_data_branch_vcs.dest_dir) / "statements"

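        # A statement.yaml is expected to look roughly like this, a sketch
        # inferred from the fields read below, not the authoritative schema:
        #
        #   vulnerability_id: CVE-XXXX-YYYY
        #   notes:
        #     - text: <free-text description>
        #   fixes:
        #     - commits:
        #         - id: <commit hash>
        #           repository: <VCS URL>
        #   artifacts:
        #     - id: pkg:maven/<group>/<artifact>@<version>
        #       affected: true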
        for yaml_file in base_path.rglob("*.yaml"):
            if yaml_file.name != "statement.yaml":
                continue

            # Reconstructed: these lines are elided in the source excerpt and
            # follow from how yaml_data and vulnerability_id are used below.
            yaml_data = saneyaml.load(yaml_file.read_text(encoding="utf-8"))
            vulnerability_id = yaml_data.get("vulnerability_id")

            note_texts = []
            for note_entry in yaml_data.get("notes", []):
                text_content = note_entry.get("text")
                if not text_content:
                    continue
                note_texts.append(text_content)
            description = "\n".join(note_texts)

            references = []
            affected_packages = []
            patches = []
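            # append_patch_classifications appears to mutate the three
            # accumulator lists in place, deriving a reference, a patch record
            # and, where possible, an affected package from the VCS URL and
            # commit hash (inferred from the call signature, not from the
            # helper's documentation).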
            for fix in yaml_data.get("fixes", []):
                for commit in fix.get("commits", []):
                    commit_hash = commit.get("id")
                    if not is_commit(commit_hash):
                        commit_hash = None

                    vcs_url = commit.get("repository")
                    append_patch_classifications(
                        url=vcs_url,
                        commit_hash=commit_hash,
                        patch_text=None,
                        affected_packages=affected_packages,
                        references=references,
                        patches=patches,
                    )

            for artifact in yaml_data.get("artifacts", []):
                # ``affected`` is a boolean flag: true marks a vulnerable
                # artifact, false one that carries the fix. Skip only artifacts
                # that omit the flag entirely; a plain truthiness test would
                # also drop the affected: false artifacts handled below.
                affected = artifact.get("affected")
                if affected is None:
                    continue

                purl_str = artifact.get("id")
                purl = PackageURL.from_string(purl_str)

                version_range_class = RANGE_CLASS_BY_SCHEMES.get(purl.type)
                if not version_range_class:
                    self.log(f"No version range class for purl type: {purl.type!r}")
                    continue

                try:
                    version = version_range_class.version_class(purl.version)
                except InvalidVersion:
                    self.log(f"Invalid Version: {purl.version!r} for purl type: {purl.type!r}")
                    continue
                # Wrap the single version in a proper range so it can be used
                # as an affected/fixed version range below.
                version_range = version_range_class.from_versions([version])
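                # For a pkg:maven purl, for instance, RANGE_CLASS_BY_SCHEMES
                # yields MavenVersionRange, and from_versions() on a single
                # MavenVersion builds the equality range vers:maven/<version>.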

                affected_package = AffectedPackageV2(
                    package=PackageURL(type=purl.type, namespace=purl.namespace, name=purl.name),
                    fixed_version_range=version_range if not affected else None,
                    affected_version_range=version_range if affected else None,
                )
                affected_packages.append(affected_package)

            advisory_url = get_advisory_url(
                file=yaml_file,
                base_path=base_path,
                # Reconstructed: these lines are elided in the source excerpt;
                # the blob URL below is an assumption following the usual
                # VulnerableCode importer pattern.
                url="https://github.com/SAP/project-kb/blob/vulnerability-data/",
            )

            yield AdvisoryData(
                advisory_id=vulnerability_id,
                summary=description,
                affected_packages=affected_packages,
                references_v2=references,
                patches=patches,
                url=advisory_url,
            )

        self.log("Collecting fix commits from ProjectKB (vulas_db_msr2019_release)...")
        csv_path = (
            Path(self.main_branch_vcs.dest_dir) / "MSR2019/dataset/vulas_db_msr2019_release.csv"
        )

        with open(csv_path, newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip header
            rows = [r for r in reader if len(r) == 4 and r[0]]  # vuln_id, vcs_url, commit_hash, poc
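        # Each kept row is expected to hold four columns: the vulnerability id,
        # the repository URL, the fix-commit id and a final flag that this
        # importer ignores (inferred from the filter above, not from the
        # dataset's documentation).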

        for vuln_id, vcs_url, commit_hash, _ in rows:
            if not vuln_id or not vcs_url or not commit_hash:
                continue

            patches = []
            affected_packages = []
            references = []
            append_patch_classifications(
                url=vcs_url,
                commit_hash=commit_hash,
                patch_text=None,
                affected_packages=affected_packages,
                references=references,
                patches=patches,
            )

            yield AdvisoryData(
                advisory_id=vuln_id,
                affected_packages=affected_packages,
                patches=patches,
                references_v2=references,
                url="https://github.com/SAP/project-kb/blob/main/MSR2019/dataset/vulas_db_msr2019_release.csv",
            )

    def clean_downloads(self):
        """Remove the cloned repositories from disk."""
        self.log("Removing cloned repositories...")
        if self.main_branch_vcs:
            self.main_branch_vcs.delete()

        if self.vuln_data_branch_vcs:
            self.vuln_data_branch_vcs.delete()

    def on_failure(self):
        """Ensure cleanup happens on pipeline failure."""