Skip to content

99 import dataset from renku aware repo #765

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Nov 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,3 +474,30 @@ def sleep_after():
import time
yield
time.sleep(0.5)


@pytest.fixture
def remote_project(data_repository, directory_tree):
    """Yield a second Renku project that contains a dataset.

    The project is initialised inside an isolated temporary directory, a
    dataset named ``remote-dataset`` is created in it, and the ``file`` and
    ``dir2`` sources from ``directory_tree`` are added to that dataset.

    :param data_repository: not used directly in the body; presumably
        requested for its setup side effects -- TODO confirm.
    :param directory_tree: fixture whose ``strpath`` is the location the
        dataset files are added from.
    :yields: a ``(runner, project_path)`` tuple with the ``CliRunner`` that
        built the project and the path of the isolated filesystem.
    """
    from renku.cli import cli

    runner = CliRunner()

    with runner.isolated_filesystem() as project_path:
        # Fail early (with the command output) when the project cannot be
        # initialised, instead of failing later in ``dataset create``.
        result = runner.invoke(cli, ['-S', 'init'])
        assert 0 == result.exit_code, result.output

        result = runner.invoke(
            cli, ['-S', 'dataset', 'create', 'remote-dataset']
        )
        assert 0 == result.exit_code

        result = runner.invoke(
            cli,
            [
                '-S', 'dataset', 'add', '-s', 'file', '-s', 'dir2',
                'remote-dataset', directory_tree.strpath
            ],
            catch_exceptions=False,
        )
        assert 0 == result.exit_code

        # Yield inside the context manager so the temporary project still
        # exists while the dependent test runs.
        yield runner, project_path
38 changes: 36 additions & 2 deletions renku/cli/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,27 @@
branch, commit, or tag. The value passed to this option must be a valid
reference in the remote Git repository.

Updating a dataset:

After adding files from a remote Git repository, you can check for updates in
those files by using the ``renku dataset update`` command. This command checks
all remote files and copies over new content if there is any. It does not
delete files from the local dataset if they are deleted from the remote Git
repository; to force deletion, use the ``--delete`` option. You can update to
a specific branch, commit, or tag by passing the ``--ref`` option.

You can limit the scope of updated files by specifying dataset names, using
``--include`` and ``--exclude`` to filter based on file names, or using
``--creators`` to filter based on creators. For example, the following command
updates only CSV files from ``my-dataset``:

.. code-block:: console

$ renku dataset update -I '*.csv' my-dataset

Note that glob patterns must be put in quotes so that the Unix shell does not
expand them.

Tagging a dataset:

A dataset can be tagged with an arbitrary tag to refer to the dataset at that
Expand Down Expand Up @@ -649,8 +670,21 @@ def _init(lock, id_queue):
@click.option(
'--ref', default=None, help='Update to a specific commit/tag/branch.'
)
def update(names, creators, include, exclude, ref):
@click.option(
'--delete',
is_flag=True,
help='Delete local files that are deleted from remote.'
)
def update(names, creators, include, exclude, ref, delete):
"""Updates files in dataset from a remote Git repo."""
progress_context = partial(progressbar, label='Updating files')
update_datasets(names, creators, include, exclude, ref, progress_context)
update_datasets(
names=names,
creators=creators,
include=include,
exclude=exclude,
ref=ref,
delete=delete,
progress_context=progress_context
)
click.secho('OK', fg='green')
2 changes: 2 additions & 0 deletions renku/cli/exception_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ def main(self, *args, **kwargs):
return super().main(*args, **kwargs)
except RenkuException as e:
click.echo('Error: {}'.format(e))
if e.__cause__ is not None:
click.echo('\n{}'.format(traceback.format_exc()))
exit_code = 1
if isinstance(e, (ParameterError, UsageError)):
exit_code = 2
Expand Down
34 changes: 29 additions & 5 deletions renku/core/commands/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from renku.core.commands.providers import ProviderFactory
from renku.core.compat import contextlib
from renku.core.errors import DatasetNotFound, InvalidAccessToken, \
MigrationRequired, ParameterError
MigrationRequired, ParameterError, UsageError
from renku.core.management.datasets import DATASET_METADATA_PATHS
from renku.core.management.git import COMMIT_DIFF_STRATEGY
from renku.core.models.creators import Creator
Expand Down Expand Up @@ -166,6 +166,14 @@ def add_to_dataset(
urlscontext=contextlib.nullcontext
):
"""Add data to a dataset."""
if sources or destination:
if len(urls) == 0:
raise UsageError('No URL is specified')
elif len(urls) > 1:
raise UsageError(
'Cannot add multiple URLs with --source or --destination'
)

# check for identifier before creating the dataset
identifier = extract_doi(
with_metadata.identifier
Expand Down Expand Up @@ -207,8 +215,10 @@ def add_to_dataset(
'"renku dataset add {0}" command with "--create" option for '
'automatic dataset creation.'.format(name)
)
except (FileNotFoundError, git.exc.NoSuchPathError):
raise ParameterError('Could not process \n{0}'.format('\n'.join(urls)))
except (FileNotFoundError, git.exc.NoSuchPathError) as e:
raise ParameterError(
'Could not find paths/URLs: \n{0}'.format('\n'.join(urls))
) from e
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TIL raise...from :)



@pass_local_client(clean=False, commit=False)
Expand Down Expand Up @@ -507,14 +517,20 @@ def import_dataset(
)


@pass_local_client(clean=True, commit=True, commit_only=DATASET_METADATA_PATHS)
@pass_local_client(
clean=True,
commit=True,
commit_only=DATASET_METADATA_PATHS,
commit_empty=False
)
def update_datasets(
client,
names,
creators,
include,
exclude,
ref,
delete,
progress_context=contextlib.nullcontext
):
"""Update files from a remote Git repo."""
Expand Down Expand Up @@ -556,7 +572,15 @@ def update_datasets(
with progress_context(
possible_updates, item_show_func=lambda x: x.path if x else None
) as progressbar:
client.update_dataset_files(progressbar, ref)
deleted_files = client.update_dataset_files(
files=progressbar, ref=ref, delete=delete
)

if deleted_files and not delete:
click.echo(
'Some files are deleted from remote. To also delete them locally '
'run update command with `--delete` flag.'
)


def _include_exclude(file_path, include=None, exclude=None):
Expand Down
14 changes: 10 additions & 4 deletions renku/core/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,11 @@ def __init__(self, message, param_hint=None):
if param_hint:
if isinstance(param_hint, (tuple, list)):
param_hint = ' / '.join('"{}"'.format(x) for x in param_hint)
message = 'Invalid value for {}: {}'.format(param_hint, message)
message = 'Invalid parameter value for {}: {}'.format(
param_hint, message
)
else:
message = 'Invalid value: {}'.format(message)
message = 'Invalid parameter value: {}'.format(message)

super().__init__(message)

Expand Down Expand Up @@ -365,5 +367,9 @@ class GitError(RenkuException):
"""Raised when a remote Git repo cannot be accessed."""


class UrlSchemaNotSupported(RenkuException):
"""Raised when adding data from unsupported URL schemas."""
class UrlSchemeNotSupported(RenkuException):
"""Raised when adding data from unsupported URL schemes."""


class OperationError(RenkuException):
"""Raised when an operation at runtime raises an error."""
Loading