Skip to content

Commit 8e226fb

Browse files
authored
Merge pull request #44 from aboutcode-org/sync-scancode-scan
Add pipeline to advertise scancode.io scans
2 parents 365bf5e + 0943842 commit 8e226fb

File tree

94 files changed

+504
-2510
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

94 files changed

+504
-2510
lines changed

fedcode/activitypub.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@
7979
"purl-ap-profile": "purl_string",
8080
"review-page": "review_id",
8181
"repository-page": "repository_id",
82-
"note-page": "note_id",
82+
"note-page": "uuid",
8383
"vulnerability-page": "vulnerability_id",
8484
}
8585

Lines changed: 4 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,34 +6,16 @@
66
# See https://github.com/nexB/federatedcode for support or download.
77
# See https://aboutcode.org for more information about AboutCode.org OSS projects.
88
#
9+
10+
from traceback import format_exc as traceback_format_exc
11+
912
from django.core.management.base import BaseCommand
10-
from django.core.management.base import CommandError
1113

12-
from fedcode.importer import Importer
1314
from fedcode.models import FederateRequest
14-
from fedcode.models import SyncRequest
1515
from fedcode.signatures import FEDERATEDCODE_PRIVATE_KEY
1616
from fedcode.signatures import HttpSignature
1717

1818

19-
def sync_task():
20-
"""
21-
sync_task is a task to run the Importer and save the status
22-
"""
23-
for sync_r in SyncRequest.objects.all().order_by("created_at"):
24-
if not sync_r.done:
25-
try:
26-
repo = sync_r.repo
27-
repo.git_repo_obj.remotes.origin.pull()
28-
importer = Importer(repo, repo.admin)
29-
importer.run()
30-
sync_r.done = True
31-
except Exception as e:
32-
sync_r.error_message = e
33-
finally:
34-
sync_r.save()
35-
36-
3719
def send_fed_req_task():
3820
"""
3921
send_fed_req_task is a task to send the http signed request to the target and save the status of the request
@@ -53,11 +35,5 @@ def send_fed_req_task():
5335

5436

5537
class Command(BaseCommand):
56-
def add_arguments(self, parser):
57-
parser.add_argument("task", choices=["sync", "federate"])
58-
5938
def handle(self, *args, **options):
60-
if options["task"] == "sync":
61-
sync_task()
62-
elif options["task"] == "federate":
63-
send_fed_req_task()
39+
send_fed_req_task()
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# FederatedCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/federatedcode for support or download.
7+
# See https://aboutcode.org for more information about AboutCode.org OSS projects.
8+
#
9+
10+
from django.core.management.base import BaseCommand
11+
from django.core.management.base import CommandError
12+
13+
from fedcode.pipelines import sync_scancode_scans
14+
from fedcode.pipelines import sync_vulnerablecode
15+
16+
# Registry of the available sync pipelines, keyed by each pipeline class's
# ``pipeline_id``.
SYNC_REGISTRY = {
    pipeline_class.pipeline_id: pipeline_class
    for pipeline_class in (
        sync_scancode_scans.SyncScanCodeScans,
        sync_vulnerablecode.SyncVulnerableCode,
    )
}
22+
23+
24+
class Command(BaseCommand):
    """Management command that runs the sync pipelines listed in ``SYNC_REGISTRY``."""

    help = "Sync metadata from git repository"

    def add_arguments(self, parser):
        """Declare the ``--list`` and ``--all`` flags and the positional pipeline IDs."""
        parser.add_argument(
            "--list",
            action="store_true",
            help="List available pipelines",
        )
        parser.add_argument(
            "--all",
            action="store_true",
            help="Sync all repo data.",
        )
        parser.add_argument(
            "pipelines",
            nargs="*",
            help="Pipeline ID",
        )

    def handle(self, *args, **options):
        """
        Dispatch on the parsed options.

        ``--list`` prints the registered pipeline IDs, ``--all`` runs every
        registered pipeline, otherwise the pipeline IDs given on the command
        line are validated and run. Raise CommandError when no pipeline is
        given or when a keyboard interrupt is received.
        """
        try:
            if options["list"]:
                self.list_pipelines()
                return

            if options["all"]:
                self.import_data(pipelines=SYNC_REGISTRY.values())
                return

            requested = options["pipelines"]
            if not requested:
                raise CommandError(
                    'Please provide at least one pipeline to execute or use "--all".'
                )
            self.import_data(validate_pipelines(requested))
        except KeyboardInterrupt:
            raise CommandError("Keyboard interrupt received. Stopping...")

    def list_pipelines(self):
        """Write the registered pipeline IDs to stdout, one per line."""
        pipeline_ids = "\n".join(SYNC_REGISTRY)
        self.stdout.write("Metadata can be synced from the following pipelines:")
        self.stdout.write(pipeline_ids)

    def import_data(self, pipelines):
        """
        Instantiate and execute each pipeline class in ``pipelines``.

        A pipeline whose ``execute()`` returns a non-zero status has its error
        written to stdout. Once every pipeline has run, raise CommandError
        naming the pipelines that failed, if any.
        """
        failed_ids = []

        for pipeline_class in pipelines:
            self.stdout.write(f"Syncing data using {pipeline_class.pipeline_id}")
            status, error = pipeline_class().execute()
            if status == 0:
                continue
            self.stdout.write(error)
            failed_ids.append(pipeline_class.pipeline_id)

        if not failed_ids:
            return

        raise CommandError(f"{len(failed_ids)} failed!: {','.join(failed_ids)}")
70+
71+
72+
def validate_pipelines(pipelines):
    """
    Return the pipeline classes registered in ``SYNC_REGISTRY`` for the given
    ``pipelines`` IDs, in the order given.

    Raise CommandError listing every ID that is not registered.
    """
    resolved = []
    unknown = []
    for pipeline_id in pipelines:
        if pipeline_id in SYNC_REGISTRY:
            resolved.append(SYNC_REGISTRY[pipeline_id])
        else:
            unknown.append(pipeline_id)

    if unknown:
        raise CommandError(f"Unknown pipelines: {unknown}")

    return resolved

fedcode/pipelines/__init__.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# FederatedCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/federatedcode for support or download.
7+
# See https://aboutcode.org for more information about AboutCode.org OSS projects.
8+
#
9+
10+
import logging
11+
from datetime import datetime
12+
from datetime import timezone
13+
from timeit import default_timer as timer
14+
15+
from aboutcode.pipeline import BasePipeline
16+
from aboutcode.pipeline import humanize_time
17+
18+
module_logger = logging.getLogger(__name__)
19+
20+
21+
class classproperty:
    """
    Descriptor exposing the result of ``fget(owner_cls)`` as a read-only
    attribute, whether it is looked up on the class or on an instance.
    """

    def __init__(self, fget):
        # ``fget`` is called with the owning class on every attribute access.
        self.fget = fget

    def __get__(self, owner_self, owner_cls):
        # The instance (``owner_self``) is ignored; only the class is passed on.
        getter = self.fget
        return getter(owner_cls)
27+
28+
29+
class FederatedCodePipeline(BasePipeline):
    """
    Base class for FederatedCode pipelines.

    Concrete pipelines are expected to assign a unique, non-empty
    ``pipeline_id`` string as a class attribute. Classes that do not get the
    ``pipeline_id`` classproperty defined at the end of this class, which
    raises NotImplementedError.
    """

    # NOTE: there is intentionally no ``pipeline_id = None`` placeholder here.
    # The ``pipeline_id`` classproperty below is bound to the same name later
    # in the class body, so such a placeholder would be silently overwritten.

    def on_failure(self):
        """
        Tasks to run in the event that pipeline execution fails.

        Implement cleanup or other tasks that need to be performed
        on pipeline failure, such as:
            - Removing cloned repositories.
            - Deleting downloaded archives.
        """
        pass

    def execute(self):
        """
        Execute each step in the order defined on this pipeline class.

        Return a ``(status, error)`` tuple: ``(0, "")`` when all steps ran, or
        ``(1, error_output)`` as soon as a step raises, where ``error_output``
        is built by ``output_from_exception``. ``on_failure`` is run before
        returning the failure status; remaining steps are not executed.
        """
        self.log(f"Pipeline [{self.pipeline_name}] starting")

        steps = self.pipeline_class.get_steps(groups=self.selected_groups)
        steps_count = len(steps)
        pipeline_start_time = timer()

        for current_index, step in enumerate(steps, start=1):
            step_name = step.__name__

            if self.selected_steps and step_name not in self.selected_steps:
                self.log(f"Step [{step_name}] skipped")
                continue

            self.set_current_step(f"{current_index}/{steps_count} {step_name}")
            self.log(f"Step [{step_name}] starting")
            step_start_time = timer()

            try:
                step(self)
            except Exception as exception:
                # Pipeline boundary: any step error is converted into a failure
                # status instead of propagating to the caller.
                self.log("Pipeline failed")
                on_failure_start_time = timer()
                self.log("Running [on_failure] tasks")
                self.on_failure()
                on_failure_run_time = timer() - on_failure_start_time
                self.log(f"Completed [on_failure] tasks in {humanize_time(on_failure_run_time)}")

                return 1, self.output_from_exception(exception)

            step_run_time = timer() - step_start_time
            self.log(f"Step [{step_name}] completed in {humanize_time(step_run_time)}")

        self.set_current_step("")  # Reset the `current_step` field on completion
        pipeline_run_time = timer() - pipeline_start_time
        self.log(f"Pipeline completed in {humanize_time(pipeline_run_time)}")

        return 0, ""

    def log(self, message, level=logging.INFO):
        """Log the given `message` to the current module logger and execution_log."""
        now_local = datetime.now(timezone.utc).astimezone()
        # Trim the microseconds down to milliseconds.
        timestamp = now_local.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        message = f"{timestamp} {message}"
        module_logger.log(level, message)
        self.append_to_log(message)

    @classproperty
    def pipeline_id(cls):
        """
        Fallback used when a pipeline class does not define its own ``pipeline_id``.

        A subclass that assigns ``pipeline_id = "..."`` shadows this descriptor
        in attribute lookup, so this getter only runs for classes without such
        an assignment. It always raises NotImplementedError.
        """
        # Do not read ``cls.pipeline_id`` here: that lookup resolves back to
        # this very descriptor and recurses until RecursionError.
        raise NotImplementedError("pipeline_id is not defined or is empty")
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# FederatedCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/federatedcode for support or download.
7+
# See https://aboutcode.org for more information about AboutCode.org OSS projects.
8+
#
9+
10+
from pathlib import Path
11+
from traceback import format_exc as traceback_format_exc
12+
13+
from aboutcode.pipeline import LoopProgress
14+
15+
from fedcode.models import Package
16+
from fedcode.models import Repository
17+
from fedcode.pipelines import FederatedCodePipeline
18+
from fedcode.pipes import utils
19+
20+
21+
class SyncScanCodeScans(FederatedCodePipeline):
    """Sync Package scans from FederatedCode git repositories."""

    pipeline_id = "sync_scancode_scans"

    @classmethod
    def steps(cls):
        """Return the pipeline steps in execution order."""
        return cls.get_git_repos, cls.sync_scan_repositories

    def get_git_repos(self):
        """Store the queryset of all Repository objects on ``self.git_repos``."""
        self.git_repos = Repository.objects.all()

    def sync_scan_repositories(self):
        """
        Pull each repository from its ``origin`` remote, sync its package
        scans, and log the total number of scans synced.
        """
        total_repositories = self.git_repos.count()
        self.log(f"Syncing package scans from {total_repositories:,d} repositories")

        tracker = LoopProgress(total_iterations=total_repositories, logger=self.log)
        synced_total = 0
        for repository in tracker.iter(self.git_repos.iterator(chunk_size=2000)):
            # Refresh the local clone before looking at its scans.
            repository.git_repo_obj.remotes.origin.pull()
            synced_here = sync_scancodeio_scan(repository=repository, logger=self.log)
            synced_total = synced_total + synced_here

        self.log(f"Successfully synced {synced_total:,d} package scans")
50+
51+
52+
def sync_scancodeio_scan(repository, logger):
    """
    Sync the package scans of ``repository`` and return the scan count
    reported by the sync helper that was used.

    When ``repository.last_imported_commit`` is set, only the changes between
    that commit and the current HEAD commit are synced. Otherwise all scans
    are synced with ``sync_all_scan``. In both cases the HEAD commit hash is
    then saved as the new ``last_imported_commit``.
    """
    repo = repository.git_repo_obj
    latest_commit = repo.head.commit
    latest_commit_hash = latest_commit.hexsha

    if last_commit_hash := repository.last_imported_commit:
        last_imported_commit = repo.commit(last_commit_hash)
        diffs = last_imported_commit.diff(latest_commit)
        # Pass the diffs through unfiltered: sync_scan_from_diff() selects the
        # "scancodeio.json" entries itself, checking both a_path and b_path.
        # Pre-filtering on a_path alone would drop entries matched only by
        # b_path.
        scan_count = sync_scan_from_diff(diffs=diffs, repository=repository, logger=logger)
    else:
        scan_count = sync_all_scan(repository=repository, logger=logger)

    repository.last_imported_commit = latest_commit_hash
    repository.save()

    return scan_count
69+
70+
71+
def sync_scan_from_diff(diffs, repository, logger):
    """
    Create or delete package scan notes for the "scancodeio.json" entries
    found in ``diffs`` and return the number of entries synced.

    Each item of ``diffs`` must expose ``a_path``, ``b_path`` and
    ``change_type``. Added ("A"), modified ("M") and renamed ("R") entries
    call ``utils.create_note`` using ``b_path``; deleted ("D") entries call
    ``utils.delete_note`` using ``a_path``. Entries with any other change
    type are logged and skipped.
    """
    scan_filename = "scancodeio.json"
    scans = [
        item
        for item in diffs
        # Tolerate a missing (None) path on either side of the diff.
        if any(path and path.endswith(scan_filename) for path in (item.a_path, item.b_path))
    ]
    scan_count = len(scans)
    synced_count = 0

    logger(f"Syncing {scan_count:,d} package scan from {repository.url}")
    progress = LoopProgress(total_iterations=scan_count, logger=logger)
    for scan in progress.iter(scans):
        change_type = scan.change_type
        if change_type in ("A", "M", "R"):
            scan_path = scan.b_path
            action = utils.create_note
        elif change_type == "D":
            scan_path = scan.a_path
            action = utils.delete_note
        else:
            # Without this branch ``scan_path`` and ``action`` would be unbound
            # on the first iteration, or stale from the previous one.
            skipped_path = scan.b_path or scan.a_path
            logger(f"Skipping {skipped_path}: unsupported change type {change_type!r}")
            continue

        purl = utils.package_metadata_path_to_purl(path=Path(scan_path), version=False)
        package, _ = Package.objects.get_or_create(purl=str(purl), service=repository.admin)
        note = utils.get_scan_note(path=Path(scan_path))
        action(pkg=package, note_dict=note)
        synced_count += 1

    return synced_count
95+
96+
97+
def sync_all_scan(repository, logger):
    """
    Create a package scan note for every "scancodeio.json" file found under
    the working directory of ``repository`` and return the number of files
    processed.
    """
    repo = repository.git_repo_obj
    root = Path(repo.working_dir)

    # Walk the tree once and reuse the result for both the count and the loop,
    # so the reported total always matches what is actually iterated.
    scans = list(root.rglob("scancodeio.json"))
    scan_count = len(scans)
    logger(f"Syncing {scan_count:,d} package scan from {repo.remotes.origin.url}")

    progress = LoopProgress(total_iterations=scan_count, logger=logger)
    for scan in progress.iter(scans):
        relative_path = scan.relative_to(root)
        purl = utils.package_metadata_path_to_purl(relative_path, version=False)
        package, _ = Package.objects.get_or_create(purl=str(purl), service=repository.admin)
        note = utils.get_scan_note(path=relative_path)
        utils.create_note(pkg=package, note_dict=note)

    return scan_count

0 commit comments

Comments
 (0)