Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion minecode_pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
#


VERSION = "0.0.1b59"
VERSION = "0.0.1b60"
20 changes: 14 additions & 6 deletions minecode_pipelines/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ def _mine_and_publish_packageurls(
commit_msg_func: Callable,
logger: Callable,
batch_size: int = 4000,
checkpoint_on_commit: bool = False,
checkpoint_func: Callable = None,
checkpoint_freq: int = 30,
):
Expand Down Expand Up @@ -192,19 +193,26 @@ def _mine_and_publish_packageurls(
)
checkout["file_to_commit"].add(purl_file)
checkout["file_processed_count"] += 1
if logger:
logger(f"{checkout['repo'].working_dir}: {checkout['file_processed_count']} / {batch_size}")

if len(checkout["file_to_commit"]) > batch_size:
if logger:
logger(f"Trying to commit PackageURLs.")
pipes.commit_and_push_checkout(
local_checkout=checkout,
commit_message=commit_msg_func(checkout["commit_count"] + 1),
logger=logger,
)

time_now = time.time()
checkpoint_due = time_now - last_checkpoint_call >= checkpoint_interval
if checkpoint_func and checkpoint_due:
checkpoint_func()
last_checkpoint_call = time_now
if checkpoint_on_commit and checkpoint_func:
checkpoint_func()

if not checkpoint_on_commit:
time_now = time.time()
checkpoint_due = time_now - last_checkpoint_call >= checkpoint_interval
if checkpoint_func and checkpoint_due:
checkpoint_func()
last_checkpoint_call = time_now

for checkout in checked_out_repos.values():
final_commit_count = checkout["commit_count"] + 1
Expand Down
4 changes: 2 additions & 2 deletions minecode_pipelines/pipelines/mine_maven.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def steps(cls):
cls.create_federatedcode_working_dir,
cls.fetch_federation_config,
cls.fetch_checkpoint_and_maven_index,
cls.mine_and_publish_alpine_packageurls,
cls.mine_and_publish_maven_packageurls,
cls.delete_working_dir,
)

Expand All @@ -61,7 +61,7 @@ def fetch_checkpoint_and_maven_index(self):
self.log(f"last_incremental: {last_incremental}")
self.maven_nexus_collector = maven.MavenNexusCollector(last_incremental=last_incremental)

def mine_and_publish_alpine_packageurls(self):
def mine_and_publish_maven_packageurls(self):
_mine_and_publish_packageurls(
packageurls=self.maven_nexus_collector.get_packages(),
total_package_count=None,
Expand Down
79 changes: 58 additions & 21 deletions minecode_pipelines/pipelines/mine_pypi.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,46 +20,83 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from scanpipe.pipelines import Pipeline
from scanpipe.pipes import federatedcode

from minecode_pipelines import pipes
from minecode_pipelines.pipes import pypi
from minecode_pipelines.pipelines import MineCodeBasePipeline
from minecode_pipelines.pipelines import _mine_and_publish_packageurls


class MinePypi(Pipeline):
class MinePypi(MineCodeBasePipeline):
"""
Mine all packageURLs from a pypi index and publish them to
a FederatedCode repo.
"""

package_batch_size = 100

@classmethod
def steps(cls):
return (
cls.check_federatedcode_eligibility,
cls.create_federatedcode_working_dir,
cls.mine_pypi_packages,
cls.mine_and_publish_pypi_packageurls,
cls.delete_cloned_repos,
cls.get_pypi_packages_to_sync,
cls.fetch_federation_config,
cls.mine_and_publish_packageurls,
cls.update_state_and_checkpoints,
cls.delete_working_dir,
)

def check_federatedcode_eligibility(self):
"""
Check if the project fulfills the following criteria for
pushing the project result to FederatedCode.
"""
federatedcode.check_federatedcode_configured_and_available(logger=self.log)

def mine_pypi_packages(self):
"""Mine pypi package names from pypi indexes or checkpoint."""
self.pypi_packages, self.state = pypi.mine_pypi_packages(logger=self.log)
self.pypi_packages, self.state, self.config_repo = pypi.mine_pypi_packages(logger=self.log)

def mine_and_publish_pypi_packageurls(self):
"""Get pypi packageURLs for all mined pypi package names."""
self.repos = pypi.mine_and_publish_pypi_packageurls(
def get_pypi_packages_to_sync(self):
"""Get pypi packages which needs to be synced using checkpoint."""
self.packages, self.last_serial = pypi.get_pypi_packages_to_sync(
packages_file=self.pypi_packages,
state=self.state,
logger=self.log,
)

def delete_cloned_repos(self):
pipes.delete_cloned_repos(repos=self.repos, logger=self.log)
def packages_count(self):
return len(self.packages)

def mine_packageurls(self):
"""Yield pypi packageURLs for all mined pypi package names."""
self.packages_mined = []
yield from pypi.mine_and_publish_pypi_packageurls(
packages_to_sync=self.packages,
packages_mined=self.packages_mined,
logger=self.log,
)

def save_check_point(self):
pypi.save_mined_packages_in_checkpoint(
packages_mined=self.packages_mined,
config_repo=self.config_repo,
logger=self.log,
)
self.packages_mined = []

def mine_and_publish_packageurls(self):
"""Mine and publish PackageURLs."""

_mine_and_publish_packageurls(
packageurls=self.mine_packageurls(),
total_package_count=self.packages_count(),
data_cluster=self.data_cluster,
checked_out_repos=self.checked_out_repos,
working_path=self.working_path,
append_purls=self.append_purls,
commit_msg_func=self.commit_message,
logger=self.log,
checkpoint_func=self.save_check_point,
checkpoint_on_commit=True,
batch_size=self.package_batch_size,
)

def update_state_and_checkpoints(self):
pypi.update_state_and_checkpoints(
config_repo=self.config_repo,
last_serial=self.last_serial,
logger=self.log,
)
9 changes: 5 additions & 4 deletions minecode_pipelines/pipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def get_checkpoint_from_file(cloned_repo, path):
return checkpoint_data or {}


def update_checkpoints_in_github(checkpoint, cloned_repo, path, logger):
def update_checkpoints_in_github(checkpoint, cloned_repo, path, logger=None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Logger is always required while calling commit_and_push_changes. Also this https://github.com/aboutcode-org/scancode.io/blob/06bc5d56dd28509bf11694f14dea7f45c2b83684/scanpipe/pipes/federatedcode.py#L292 is a bug.

Suggested change
def update_checkpoints_in_github(checkpoint, cloned_repo, path, logger=None):
def update_checkpoints_in_github(checkpoint, cloned_repo, path, logger):

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This forces the use of logger throughout all the functions, which is not what we have elsewhere in SCIO, logging is turned on optionally and anything which is a error should be added as an error and not just handled as a log entry. And logging should be optional IMHO as a debug method, or to log progress.

from scanpipe.pipes.federatedcode import commit_and_push_changes

checkpoint_path = os.path.join(cloned_repo.working_dir, path)
Expand All @@ -65,16 +65,17 @@ def get_mined_packages_from_checkpoint(config_repo, checkpoint_path):
return checkpoint.get("packages_mined", [])


def update_mined_packages_in_checkpoint(packages, config_repo, cloned_repo, checkpoint_path):
def update_mined_packages_in_checkpoint(packages, config_repo, cloned_repo, checkpoint_path, logger=None):
mined_packages = get_mined_packages_from_checkpoint(
config_repo=config_repo,
checkpoint_path=checkpoint_path,
)
packages = {"packages_mined": packages + mined_packages}
packages_to_update = {"packages_mined": packages + mined_packages}
update_checkpoints_in_github(
checkpoint=packages,
checkpoint=packages_to_update,
cloned_repo=cloned_repo,
path=checkpoint_path,
logger=logger,
)


Expand Down
Loading