#

import logging
import re
import shutil
import tempfile
import traceback
from collections import defaultdict
from datetime import datetime
from datetime import timezone
from timeit import default_timer as timer

from aboutcode.pipeline import LoopProgress
from aboutcode.pipeline import PipelineDefinition
from aboutcode.pipeline import humanize_time
from git import Repo
from packageurl.contrib.url2purl import url2purl

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import AffectedPackageV2
from vulnerabilities.importer import PackageCommitPatchData
from vulnerabilities.improver import MAX_CONFIDENCE
from vulnerabilities.models import Advisory
from vulnerabilities.models import PipelineRun
@@ ... @@ def collect_and_store_advisories(self):
                continue

        self.log(f"Successfully collected {collected_advisory_count:,d} advisories")


class CollectVCSFixCommitPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    Pipeline to collect fix commits from any git repository.
    """

    repo_url: str
    patterns: list[str] = [
        r"\bCVE-\d{4}-\d{4,19}\b",
        r"\bGHSA-[2-9cfghjmpqrvwx]{4}-[2-9cfghjmpqrvwx]{4}-[2-9cfghjmpqrvwx]{4}\b",
    ]
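    # Illustrative (hypothetical) commit-message matches: "Fix CVE-2024-12345
    # in the URL parser" or "Backport the patch for GHSA-2c3f-gh4j-5m6p".
    # Matching below is case-insensitive, so "cve-2024-12345" is also found.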

    @classmethod
    def steps(cls):
        return (
            cls.clone,
            cls.collect_and_store_advisories,
            cls.clean_downloads,
        )

    def clone(self):
        """Clone the repository as a bare, blobless partial clone."""
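        # A bare clone with --filter=blob:none fetches commits and trees but
        # no file contents, which is all that commit-message scanning needs.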
        self.repo = Repo.clone_from(
            url=self.repo_url,
            to_path=tempfile.mkdtemp(),
            bare=True,
            multi_options=["--filter=blob:none"],
        )

    def advisories_count(self) -> int:
        """The advisory count is not known until the commits are scanned."""
        return 0

    def extract_vulnerability_ids(self, commit) -> list[str]:
        """
        Extract vulnerability IDs from a commit message.
        Return a list of matched IDs, normalized to canonical case so that
        case variants of the same ID are grouped together.
        """
        matches = []
        for pattern in self.patterns:
            for match in re.findall(pattern, commit.message, flags=re.IGNORECASE):
                if match.upper().startswith("CVE-"):
                    # CVE IDs are canonically uppercase.
                    matches.append(match.upper())
                elif match.upper().startswith("GHSA-"):
                    # GHSA IDs are canonically lowercase after the prefix.
                    matches.append("GHSA-" + match[len("GHSA-"):].lower())
                else:
                    matches.append(match)
        return matches

    def collect_fix_commits(self):
        """
        Iterate through repository commits and group them by vulnerability identifier.
        Return a mapping of {vulnerability_id: [(commit_id, commit_message), ...]}.
        """
        self.log("Processing git repository fix commits (grouped by vulnerability IDs).")

        grouped_commits = defaultdict(list)
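        # "--all" walks commits reachable from every ref (all branches and
        # tags), not only the default branch.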
        for commit in self.repo.iter_commits("--all"):
            matched_ids = self.extract_vulnerability_ids(commit)
            if not matched_ids:
                continue

            commit_id = commit.hexsha
            commit_message = commit.message.strip()

            for vuln_id in matched_ids:
                grouped_commits[vuln_id].append((commit_id, commit_message))

        self.log(f"Found {len(grouped_commits)} vulnerabilities with related commits.")
        self.log("Finished processing all commits.")
        return grouped_commits

    def collect_advisories(self):
        """
        Yield an AdvisoryData object for each vulnerability ID and its related fix commits.
        """
        self.log("Generating AdvisoryData objects from grouped commits.")
        grouped_commits = self.collect_fix_commits()
        purl = url2purl(self.repo_url)
        if not purl:
            # url2purl returns None for URLs it cannot translate into a Package URL.
            self.log(f"Cannot infer a Package URL from {self.repo_url!r}; no advisories collected.")
            return
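        # url2purl knows common forges; for example it maps
        # "https://github.com/curl/curl" to pkg:github/curl/curl
        # (the URL here is purely illustrative).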

        for vuln_id, commits_data in grouped_commits.items():
            if not commits_data or not vuln_id:
                continue

            commit_hash_set = {commit_hash for commit_hash, _ in commits_data}
            affected_packages = [
                AffectedPackageV2(
                    package=purl,
                    fixed_by_commit_patches=[
                        PackageCommitPatchData(vcs_url=self.repo_url, commit_hash=commit_hash)
                        for commit_hash in commit_hash_set
                    ],
                )
            ]

            yield AdvisoryData(
                advisory_id=vuln_id,
                affected_packages=affected_packages,
                url=self.repo_url,
            )

    def clean_downloads(self):
        """Remove the temporary repository clone, if any."""
        self.log("Cleaning up local repository resources.")
        if hasattr(self, "repo") and self.repo:
            # For a bare clone GitPython has no working tree; fall back to the
            # git directory, and close the repo to release open resources.
            repo_path = self.repo.working_dir or self.repo.git_dir
            self.repo.close()
            shutil.rmtree(path=repo_path, ignore_errors=True)

    def on_failure(self):
        """Ensure cleanup is always performed on failure."""
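        # clean_downloads() tolerates a missing or already-removed clone, so
        # this is safe even when the failure happened before clone() ran.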
        self.clean_downloads()
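

# A minimal sketch of wiring a concrete importer on top of this pipeline.
# The subclass name, pipeline_id value, and repository URL are hypothetical
# examples, not part of the codebase.
class CollectCurlFixCommitPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the curl git repository (illustrative)."""

    pipeline_id = "collect_curl_fix_commits_v2"
    repo_url = "https://github.com/curl/curl.git"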