Skip to content

Commit 528857e

Browse files
committed
Update commit parsing pipeline to support collecting fix commits from multiple repositories
Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent 12dc381 commit 528857e

File tree

6 files changed

+220
-155
lines changed

6 files changed

+220
-155
lines changed

vulnerabilities/importers/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
from vulnerabilities.pipelines.v2_importers import aosp_importer as aosp_importer_v2
4545
from vulnerabilities.pipelines.v2_importers import apache_httpd_importer as apache_httpd_v2
4646
from vulnerabilities.pipelines.v2_importers import archlinux_importer as archlinux_importer_v2
47+
from vulnerabilities.pipelines.v2_importers import collect_fix_commits as collect_fix_commits_v2
4748
from vulnerabilities.pipelines.v2_importers import curl_importer as curl_importer_v2
4849
from vulnerabilities.pipelines.v2_importers import (
4950
elixir_security_importer as elixir_security_importer_v2,
@@ -135,5 +136,19 @@
135136
ubuntu_usn.UbuntuUSNImporter,
136137
fireeye.FireyeImporter,
137138
oss_fuzz.OSSFuzzImporter,
139+
collect_fix_commits_v2.CollectNodejsFixCommitsPipeline,
140+
collect_fix_commits_v2.CollectCpythonFixCommitsPipeline,
141+
collect_fix_commits_v2.CollectGoFixCommitsPipeline,
142+
collect_fix_commits_v2.CollectRustFixCommitsPipeline,
143+
collect_fix_commits_v2.CollectPhpFixCommitsPipeline,
144+
collect_fix_commits_v2.CollectRubyFixCommitsPipeline,
145+
collect_fix_commits_v2.CollectNginxFixCommitsPipeline,
146+
collect_fix_commits_v2.CollectPostgresFixCommitsPipeline,
147+
collect_fix_commits_v2.CollectMysqlFixCommitsPipeline,
148+
collect_fix_commits_v2.CollectGitFixCommitsPipeline,
149+
collect_fix_commits_v2.CollectTensorflowFixCommitsPipeline,
150+
collect_fix_commits_v2.CollectFirefoxFixCommitsPipeline,
151+
collect_fix_commits_v2.CollectQEMUFixCommitsPipeline,
152+
collect_fix_commits_v2.CollectDenoFixCommitsPipeline,
138153
]
139154
)

vulnerabilities/pipelines/__init__.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88
#
99

1010
import logging
11+
import re
12+
import shutil
13+
import tempfile
1114
import traceback
15+
from collections import defaultdict
1216
from datetime import datetime
1317
from datetime import timezone
1418
from timeit import default_timer as timer
@@ -19,8 +23,12 @@
1923
from aboutcode.pipeline import LoopProgress
2024
from aboutcode.pipeline import PipelineDefinition
2125
from aboutcode.pipeline import humanize_time
26+
from git import Repo
27+
from packageurl.contrib.url2purl import url2purl
2228

2329
from vulnerabilities.importer import AdvisoryData
30+
from vulnerabilities.importer import AffectedPackageV2
31+
from vulnerabilities.importer import PackageCommitPatchData
2432
from vulnerabilities.improver import MAX_CONFIDENCE
2533
from vulnerabilities.models import Advisory
2634
from vulnerabilities.models import PipelineRun
@@ -321,3 +329,109 @@ def collect_and_store_advisories(self):
321329
continue
322330

323331
self.log(f"Successfully collected {collected_advisory_count:,d} advisories")
332+
333+
334+
class CollectVCSFixCommitPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    Pipeline to collect fix commits from any git repository.

    Subclasses set ``repo_url`` to the repository to scan. Every commit whose
    message mentions a vulnerability identifier matching one of ``patterns``
    is reported as a fixing commit for that identifier.
    """

    # URL of the git repository to clone and scan; provided by subclasses.
    repo_url: str
    # Regular expressions (matched case-insensitively) that recognise
    # vulnerability identifiers in commit messages. They are used with
    # ``re.findall`` so they should not contain capturing groups.
    patterns: list[str] = [
        r"\bCVE-\d{4}-\d{4,19}\b",
        r"GHSA-[2-9cfghjmpqrvwx]{4}-[2-9cfghjmpqrvwx]{4}-[2-9cfghjmpqrvwx]{4}",
    ]

    @classmethod
    def steps(cls):
        """Return the ordered pipeline steps: clone, collect, clean up."""
        return (
            cls.clone,
            cls.collect_and_store_advisories,
            cls.clean_downloads,
        )

    def clone(self):
        """Clone the repository as a bare, blob-less clone in a temp directory."""
        # Record the directory *before* cloning so that a failed or partial
        # clone can still be removed by ``clean_downloads``.
        self.clone_dir = tempfile.mkdtemp()
        self.repo = Repo.clone_from(
            url=self.repo_url,
            to_path=self.clone_dir,
            bare=True,
            no_checkout=True,
            # Only commit messages are read, so file contents are not fetched.
            multi_options=["--filter=blob:none"],
        )

    def advisories_count(self) -> int:
        """Return 0: no count is available without scanning the commits."""
        return 0

    @staticmethod
    def _normalize_vulnerability_id(vuln_id: str) -> str:
        """
        Return ``vuln_id`` in canonical letter case.

        Matching is case-insensitive, so the same identifier can appear as
        ``cve-2021-1234`` or ``CVE-2021-1234``. CVE ids are upper-cased and
        GHSA ids get an upper-case prefix with a lower-case suffix. Ids of any
        other scheme are returned unchanged.
        """
        upper = vuln_id.upper()
        if upper.startswith("CVE-"):
            return upper
        if upper.startswith("GHSA-"):
            return "GHSA-" + vuln_id[len("GHSA-"):].lower()
        return vuln_id

    def extract_vulnerability_id(self, commit) -> list[str]:
        """
        Extract vulnerability ids from a commit message.

        Return the list of distinct matched ids, normalized to canonical
        letter case, in order of first appearance (per pattern).
        """
        message = commit.message
        if isinstance(message, bytes):
            # A str pattern cannot be applied to bytes; decode leniently.
            message = message.decode("utf-8", errors="replace")

        # A dict is used as an insertion-ordered set.
        unique_ids = {}
        for pattern in self.patterns:
            for found in re.findall(pattern, message, flags=re.IGNORECASE):
                unique_ids.setdefault(self._normalize_vulnerability_id(found), None)
        return list(unique_ids)

    def collect_fix_commits(self):
        """
        Iterate through repository commits and group them by vulnerability id.

        Return a mapping of ``vuln_id`` to a list of
        ``(commit_id, commit_message)`` tuples.
        """
        self.log("Processing git repository fix commits (grouped by vulnerability IDs).")

        grouped_commits = defaultdict(list)
        for commit in self.repo.iter_commits("--all"):
            matched_ids = self.extract_vulnerability_id(commit)
            if not matched_ids:
                continue

            commit_id = commit.hexsha
            commit_message = commit.message.strip()

            for vuln_id in matched_ids:
                grouped_commits[vuln_id].append((commit_id, commit_message))

        self.log(f"Found {len(grouped_commits)} vulnerabilities with related commits.")
        self.log("Finished processing all commits.")
        return grouped_commits

    def collect_advisories(self):
        """
        Yield one AdvisoryData per vulnerability id, listing its fix commits.
        """
        self.log("Generating AdvisoryData objects from grouped commits.")
        grouped_commits = self.collect_fix_commits()
        purl = url2purl(self.repo_url)

        for vuln_id, commits_data in grouped_commits.items():
            if not commits_data or not vuln_id:
                continue

            # Deduplicate and sort so the emitted patch list is deterministic
            # from one run to the next.
            commit_hashes = sorted({commit_hash for commit_hash, _ in commits_data})
            affected_packages = [
                AffectedPackageV2(
                    package=purl,
                    fixed_by_commit_patches=[
                        PackageCommitPatchData(vcs_url=self.repo_url, commit_hash=commit_hash)
                        for commit_hash in commit_hashes
                    ],
                )
            ]

            yield AdvisoryData(
                advisory_id=vuln_id,
                affected_packages=affected_packages,
                url=self.repo_url,
            )

    def clean_downloads(self):
        """
        Cleanup any temporary repository data.

        Safe to call more than once, and safe to call when ``clone`` failed
        part-way or never ran.
        """
        self.log("Cleaning up local repository resources.")
        repo = getattr(self, "repo", None)
        clone_dir = getattr(self, "clone_dir", None)

        if repo is not None:
            # Fall back to the repo's own path when ``repo`` was set without
            # going through ``clone``.
            clone_dir = clone_dir or repo.working_dir
            # Release resources held by GitPython before deleting the files.
            repo.close()

        if clone_dir:
            # ignore_errors: the directory may already have been removed by
            # an earlier cleanup call.
            shutil.rmtree(path=clone_dir, ignore_errors=True)
            self.clone_dir = None

    def on_failure(self):
        """Ensure cleanup is always performed on failure."""
        self.clean_downloads()
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
from vulnerabilities.pipelines import CollectVCSFixCommitPipeline
2+
3+
4+
class CollectNodejsFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the nodejs/node git repository."""

    repo_url = "https://github.com/nodejs/node"
    pipeline_id = "collect_nodejs_fix_commits"
7+
8+
9+
class CollectCpythonFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the python/cpython git repository."""

    repo_url = "https://github.com/python/cpython"
    pipeline_id = "collect_cpython_fix_commits"
12+
13+
14+
class CollectGoFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the golang/go git repository."""

    repo_url = "https://github.com/golang/go"
    pipeline_id = "collect_go_fix_commits"
17+
18+
19+
class CollectRustFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the rust-lang/rust git repository."""

    repo_url = "https://github.com/rust-lang/rust"
    pipeline_id = "collect_rust_lang_fix_commits"
22+
23+
24+
class CollectPhpFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the php/php-src git repository."""

    repo_url = "https://github.com/php/php-src"
    pipeline_id = "collect_php_fix_commits"
27+
28+
29+
class CollectRubyFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the ruby/ruby git repository."""

    repo_url = "https://github.com/ruby/ruby"
    pipeline_id = "collect_ruby_fix_commits"
32+
33+
34+
class CollectNginxFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the nginx/nginx git repository."""

    repo_url = "https://github.com/nginx/nginx"
    pipeline_id = "collect_nginx_fix_commits"
37+
38+
39+
class CollectPostgresFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the postgres/postgres git repository."""

    repo_url = "https://github.com/postgres/postgres"
    pipeline_id = "collect_postgres_fix_commits"
42+
43+
44+
class CollectMysqlFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the mysql/mysql-server git repository."""

    repo_url = "https://github.com/mysql/mysql-server"
    pipeline_id = "collect_mysql_fix_commits"
47+
48+
49+
class CollectGitFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git/git git repository."""

    repo_url = "https://github.com/git/git"
    pipeline_id = "collect_git_fix_commits"
52+
53+
54+
class CollectTensorflowFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the tensorflow/tensorflow git repository."""

    repo_url = "https://github.com/tensorflow/tensorflow"
    pipeline_id = "collect_tensorflow_fix_commits"
57+
58+
59+
class CollectFirefoxFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the mozilla-firefox/firefox git repository."""

    repo_url = "https://github.com/mozilla-firefox/firefox"
    pipeline_id = "collect_firefox_fix_commits"
62+
63+
64+
class CollectQEMUFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the qemu/qemu git repository."""

    repo_url = "https://github.com/qemu/qemu"
    pipeline_id = "collect_qemu_fix_commits"
67+
68+
69+
class CollectDenoFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the denoland/deno git repository."""

    repo_url = "https://github.com/denoland/deno"
    pipeline_id = "collect_deno_fix_commits"

vulnerabilities/pipelines/v2_importers/collect_repo_fix_commits.py

Lines changed: 0 additions & 129 deletions
This file was deleted.

0 commit comments

Comments
 (0)