Skip to content

Commit

Permalink
Imgspec/copy auto (#2731)
Browse files Browse the repository at this point in the history
Signed-off-by: Yee Hing Tong <wild-endeavor@users.noreply.github.com>
  • Loading branch information
wild-endeavor authored Sep 10, 2024
1 parent 15d82ef commit 26559fa
Show file tree
Hide file tree
Showing 18 changed files with 225 additions and 67 deletions.
2 changes: 2 additions & 0 deletions docs/source/design/clis.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ The ``serialize`` command is deprecated around the end of Q3 2024. Users should
Migrate
-------
To use the ``package`` command, make the following changes:

* The ``--local-source-root`` option should be changed to ``--source``
* If the already ``--in-container-virtualenv-root`` option was specified, then move to the ``--python-interpreter`` option in ``package``. The default Python interpreter for serialize was based on this deprecated flag, and if not specified, ``sys.executable``. The default for ``package`` is ``/opt/venv/bin/python3``. If that is not where the Python interpreter is located in the task container, then you'll need to now specify ``--python-interpreter``. Note that this was only used for Spark tasks.
* The ``--in-container-config-path`` option should be removed as this was not actually being used by the ``serialize`` command.
Expand All @@ -121,5 +122,6 @@ To use the ``package`` command, make the following changes:
Functional Changes
------------------
Beyond the options, the ``package`` command differs in that

* Whether or not to use fast register should be specified by the ``--copy auto`` or ``--copy all`` flags, rather than ``fast`` being a subcommand.
* The serialized file output by default is in a .tgz file, rather than being separate files. This means that any subsequent ``flytectl register`` command will need to be updated with the ``--archive`` flag.
2 changes: 1 addition & 1 deletion flytekit/clis/sdk_in_container/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from flytekit.clis.sdk_in_container.constants import CTX_CONFIG_FILE
from flytekit.configuration import ImageConfig
from flytekit.configuration.plugin import get_plugin
from flytekit.constants import CopyFileDetection
from flytekit.remote.remote import FlyteRemote
from flytekit.tools.fast_registration import CopyFileDetection

FLYTE_REMOTE_INSTANCE_KEY = "flyte_remote"

Expand Down
3 changes: 2 additions & 1 deletion flytekit/clis/sdk_in_container/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
ImageConfig,
SerializationSettings,
)
from flytekit.constants import CopyFileDetection
from flytekit.interaction.click_types import key_value_callback
from flytekit.tools.fast_registration import CopyFileDetection, FastPackageOptions
from flytekit.tools.fast_registration import FastPackageOptions
from flytekit.tools.repo import NoSerializableEntitiesError, serialize_and_package


Expand Down
2 changes: 1 addition & 1 deletion flytekit/clis/sdk_in_container/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
from flytekit.clis.sdk_in_container.utils import domain_option_dec, project_option_dec
from flytekit.configuration import ImageConfig
from flytekit.configuration.default_images import DefaultImages
from flytekit.constants import CopyFileDetection
from flytekit.interaction.click_types import key_value_callback
from flytekit.loggers import logger
from flytekit.tools import repo
from flytekit.tools.fast_registration import CopyFileDetection

_register_help = """
This command is similar to ``package`` but instead of producing a zip file, all your Flyte entities are compiled,
Expand Down
3 changes: 2 additions & 1 deletion flytekit/clis/sdk_in_container/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
SerializationSettings,
)
from flytekit.configuration.plugin import get_plugin
from flytekit.constants import CopyFileDetection
from flytekit.core import context_manager
from flytekit.core.artifact import ArtifactQuery
from flytekit.core.base_task import PythonTask
Expand Down Expand Up @@ -66,7 +67,7 @@
)
from flytekit.remote.executions import FlyteWorkflowExecution
from flytekit.tools import module_loader
from flytekit.tools.fast_registration import CopyFileDetection, FastPackageOptions
from flytekit.tools.fast_registration import FastPackageOptions
from flytekit.tools.script_mode import _find_project_root, compress_scripts, get_all_modules
from flytekit.tools.translator import Options

Expand Down
14 changes: 14 additions & 0 deletions flytekit/constants/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from __future__ import annotations

from enum import Enum


class CopyFileDetection(Enum):
LOADED_MODULES = 1
ALL = 2
# This option's meaning will change in the future. In the future this will mean that no files should be copied
# (i.e. no fast registration is used). For now, both this value and setting this Enum to Python None are both
# valid to distinguish between users explicitly setting --copy none and not setting the flag.
# Currently, this is only used for register, not for package or run because run doesn't have a no-fast-register
# option and package is by default non-fast.
NO_COPY = 3
10 changes: 5 additions & 5 deletions flytekit/core/container_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from flytekit.core.context_manager import FlyteContext
from flytekit.core.interface import Interface
from flytekit.core.pod_template import PodTemplate
from flytekit.core.python_auto_container import get_registerable_container_image
from flytekit.core.python_auto_container import get_registerable_container_image, update_image_spec_copy_handling
from flytekit.core.resources import Resources, ResourceSpec
from flytekit.core.utils import _get_container_definition, _serialize_pod_spec
from flytekit.image_spec.image_spec import ImageSpec
Expand Down Expand Up @@ -279,10 +279,10 @@ def _get_data_loading_config(self) -> _task_model.DataLoadingConfig:
)

def _get_image(self, settings: SerializationSettings) -> str:
if settings.fast_serialization_settings is None or not settings.fast_serialization_settings.enabled:
if isinstance(self._image, ImageSpec):
# Set the source root for the image spec if it's non-fast registration
self._image.source_root = settings.source_root
"""Update image spec based on fast registration usage, and return string representing the image"""
if isinstance(self._image, ImageSpec):
update_image_spec_copy_handling(self._image, settings)

return get_registerable_container_image(self._image, settings.image_config)

def _get_container(self, settings: SerializationSettings) -> _task_model.Container:
Expand Down
40 changes: 36 additions & 4 deletions flytekit/core/python_auto_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from flyteidl.core import tasks_pb2

from flytekit.configuration import ImageConfig, SerializationSettings
from flytekit.constants import CopyFileDetection
from flytekit.core.base_task import PythonTask, TaskMetadata, TaskResolverMixin
from flytekit.core.context_manager import FlyteContextManager
from flytekit.core.pod_template import PodTemplate
Expand Down Expand Up @@ -185,10 +186,10 @@ def get_command(self, settings: SerializationSettings) -> List[str]:
return self._get_command_fn(settings)

def get_image(self, settings: SerializationSettings) -> str:
if settings.fast_serialization_settings is None or not settings.fast_serialization_settings.enabled:
if isinstance(self.container_image, ImageSpec):
# Set the source root for the image spec if it's non-fast registration
self.container_image.source_root = settings.source_root
"""Update image spec based on fast registration usage, and return string representing the image"""
if isinstance(self.container_image, ImageSpec):
update_image_spec_copy_handling(self.container_image, settings)

return get_registerable_container_image(self.container_image, settings.image_config)

def get_container(self, settings: SerializationSettings) -> _task_model.Container:
Expand Down Expand Up @@ -273,6 +274,37 @@ def get_all_tasks(self) -> List[PythonAutoContainerTask]: # type: ignore
default_task_resolver = DefaultTaskResolver()


def update_image_spec_copy_handling(image_spec: ImageSpec, settings: SerializationSettings):
"""
This helper function is where the relationship between fast register and ImageSpec is codified.
If fast register is not enabled, then source root is used and then files are copied.
See the copy option in ImageSpec for more information.
Currently the relationship is incidental. Because serialization settings are not passed into the image spec
build command (and it probably shouldn't be), the builder has no concept of which files to copy, when, and
from where. (or to where but that is hard-coded)
"""
# Handle when the copy method is explicitly set by the user.
if image_spec.source_copy_mode is not None:
if image_spec.source_copy_mode != CopyFileDetection.NO_COPY:
# if we need to copy any files, make sure source root is set. This preserves the behavior pre-copy arg,
# and allows the user to not have to specify source root.
if image_spec.source_root is None and settings.source_root is not None:
image_spec.source_root = settings.source_root

# Handle the default behavior of setting the behavior based on the inverse of fast register usage
# The default behavior additionally requires that serializa
elif settings.fast_serialization_settings is None or not settings.fast_serialization_settings.enabled:
# Set the source root for the image spec if it's non-fast registration
# Unfortunately whether the source_root/copy instructions should be set is implicitly dependent also on the
# existence of the source root in settings.
if settings.source_root is not None or image_spec.source_root is not None:
if image_spec.source_root is None:
image_spec.source_root = settings.source_root
if image_spec.source_copy_mode is None:
image_spec.source_copy_mode = CopyFileDetection.LOADED_MODULES


def get_registerable_container_image(img: Optional[Union[str, ImageSpec]], cfg: ImageConfig) -> str:
"""
Resolve the image to the real image name that should be used for registration.
Expand Down
30 changes: 23 additions & 7 deletions flytekit/image_spec/default_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@

import click

from flytekit.constants import CopyFileDetection
from flytekit.image_spec.image_spec import (
_F_IMG_ID,
ImageSpec,
ImageSpecBuilder,
)
from flytekit.tools.ignore import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore
from flytekit.tools.script_mode import ls_files

UV_PYTHON_INSTALL_COMMAND_TEMPLATE = Template(
"""\
Expand Down Expand Up @@ -165,16 +167,28 @@ def create_docker_context(image_spec: ImageSpec, tmp_dir: Path):

apt_install_command = APT_INSTALL_COMMAND_TEMPLATE.substitute(APT_PACKAGES=" ".join(apt_packages))

if image_spec.source_root:
source_path = tmp_dir / "src"
if image_spec.source_copy_mode is not None and image_spec.source_copy_mode != CopyFileDetection.NO_COPY:
if not image_spec.source_root:
raise ValueError(f"Field source_root for {image_spec} must be set when copy is set")

source_path = tmp_dir / "src"
source_path.mkdir(parents=True, exist_ok=True)
# todo: See note in we should pipe through ignores from the command line here at some point.
# what about deref_symlink?
ignore = IgnoreGroup(image_spec.source_root, [GitIgnore, DockerIgnore, StandardIgnore])
shutil.copytree(
image_spec.source_root,
source_path,
ignore=shutil.ignore_patterns(*ignore.list_ignored()),
dirs_exist_ok=True,

ls, _ = ls_files(
str(image_spec.source_root), image_spec.source_copy_mode, deref_symlinks=False, ignore_group=ignore
)

for file_to_copy in ls:
rel_path = os.path.relpath(file_to_copy, start=str(image_spec.source_root))
Path(source_path / rel_path).parent.mkdir(parents=True, exist_ok=True)
shutil.copy(
file_to_copy,
source_path / rel_path,
)

copy_command_runtime = "COPY --chown=flytekit ./src /root"
else:
copy_command_runtime = ""
Expand Down Expand Up @@ -228,10 +242,12 @@ class DefaultImageBuilder(ImageSpecBuilder):
"""Image builder using Docker and buildkit."""

_SUPPORTED_IMAGE_SPEC_PARAMETERS: ClassVar[set] = {
"id",
"name",
"python_version",
"builder",
"source_root",
"copy",
"env",
"registry",
"packages",
Expand Down
43 changes: 38 additions & 5 deletions flytekit/image_spec/image_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import requests
from packaging.version import Version

from flytekit.constants import CopyFileDetection
from flytekit.exceptions.user import FlyteAssertion

DOCKER_HUB = "docker.io"
Expand Down Expand Up @@ -51,12 +52,20 @@ class ImageSpec:
commands: Command to run during the building process
tag_format: Custom string format for image tag. The ImageSpec hash passed in as `spec_hash`. For example,
to add a "dev" suffix to the image tag, set `tag_format="{spec_hash}-dev"`
source_copy_mode: This option allows the user to specify which source files to copy from the local host, into the image.
Not setting this option means to use the default flytekit behavior. The default behavior is:
- if fast register is used, source files are not copied into the image (because they're already copied
into the fast register tar layer).
- if fast register is not used, then the LOADED_MODULES (aka 'auto') option is used to copy loaded
Python files into the image.
If the option is set by the user, then that option is of course used.
"""

name: str = "flytekit"
python_version: str = None # Use default python in the base image if None.
builder: Optional[str] = None
source_root: Optional[str] = None
source_root: Optional[str] = None # a.txt:auto
env: Optional[typing.Dict[str, str]] = None
registry: Optional[str] = None
packages: Optional[List[str]] = None
Expand All @@ -74,13 +83,19 @@ class ImageSpec:
entrypoint: Optional[List[str]] = None
commands: Optional[List[str]] = None
tag_format: Optional[str] = None
source_copy_mode: Optional[CopyFileDetection] = None

def __post_init__(self):
self.name = self.name.lower()
self._is_force_push = os.environ.get(FLYTE_FORCE_PUSH_IMAGE_SPEC, False) # False by default
if self.registry:
self.registry = self.registry.lower()

# If not set, help the user set this option as well, to support the older default behavior where existence
# of the source root implied that copying of files was needed.
if self.source_root is not None:
self.source_copy_mode = self.source_copy_mode or CopyFileDetection.LOADED_MODULES

parameters_str_list = [
"packages",
"conda_channels",
Expand Down Expand Up @@ -109,6 +124,8 @@ def id(self) -> str:
- deduced abc: flyteorg/flytekit:123
- deduced xyz: flyteorg/flytekit:456
The result of this property also depends on whether or not update_image_spec_copy_handling was called.
:return: a unique identifier of the ImageSpec
"""
# Only get the non-None values in the ImageSpec to ensure the hash is consistent across different Flytekit versions.
Expand All @@ -125,24 +142,40 @@ def tag(self) -> str:
Calculate a hash from the image spec. The hash will be the tag of the image.
We will also read the content of the requirement file and the source root to calculate the hash.
Therefore, it will generate different hash if new dependencies are added or the source code is changed.
Keep in mind the fields source_root and copy may be changed by update_image_spec_copy_handling, so when
you call this property in relation to that function matter will change the output.
"""

# copy the image spec to avoid modifying the original image spec. otherwise, the hash will be different.
spec = copy.deepcopy(self)
if isinstance(spec.base_image, ImageSpec):
spec = dataclasses.replace(spec, base_image=spec.base_image.image_name())

if self.source_root:
from flytekit.tools.fast_registration import compute_digest
if self.source_copy_mode is not None and self.source_copy_mode != CopyFileDetection.NO_COPY:
if not self.source_root:
raise ValueError(f"Field source_root for image spec {self.name} must be set when copy is set")

# Imports of flytekit.tools are circular
from flytekit.tools.ignore import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore
from flytekit.tools.script_mode import ls_files

# todo: we should pipe through ignores from the command line here at some point.
# what about deref_symlink?
ignore = IgnoreGroup(self.source_root, [GitIgnore, DockerIgnore, StandardIgnore])
digest = compute_digest(self.source_root, ignore.is_ignored)
spec = dataclasses.replace(spec, source_root=digest)

_, ls_digest = ls_files(
str(self.source_root), self.source_copy_mode, deref_symlinks=False, ignore_group=ignore
)

# Since the source root is supposed to represent the files, store the digest into the source root as a
# shortcut to represent all the files.
spec = dataclasses.replace(spec, source_root=ls_digest)

if spec.requirements:
requirements = hashlib.sha1(pathlib.Path(spec.requirements).read_bytes().strip()).hexdigest()
spec = dataclasses.replace(spec, requirements=requirements)

# won't rebuild the image if we change the registry_config path
spec = dataclasses.replace(spec, registry_config=None)
tag = spec.id.replace("-", "_")
Expand Down
23 changes: 2 additions & 21 deletions flytekit/tools/fast_registration.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,18 @@
import posixpath
import shutil
import subprocess
import sys
import tarfile
import tempfile
import time
import typing
from dataclasses import dataclass
from enum import Enum
from typing import Optional

import click
from rich import print as rich_print
from rich.tree import Tree

from flytekit.constants import CopyFileDetection
from flytekit.core.context_manager import FlyteContextManager
from flytekit.core.utils import timeit
from flytekit.exceptions.user import FlyteDataNotFoundException
Expand All @@ -31,17 +30,6 @@
FAST_FILEENDING = ".tar.gz"


class CopyFileDetection(Enum):
LOADED_MODULES = 1
ALL = 2
# This option's meaning will change in the future. In the future this will mean that no files should be copied
# (i.e. no fast registration is used). For now, both this value and setting this Enum to Python None are both
# valid to distinguish between users explicitly setting --copy none and not setting the flag.
# Currently, this is only used for register, not for package or run because run doesn't have a no-fast-register
# option and package is by default non-fast.
NO_COPY = 3


@dataclass(frozen=True)
class FastPackageOptions:
"""
Expand Down Expand Up @@ -130,14 +118,7 @@ def fast_package(
if options and (
options.copy_style == CopyFileDetection.LOADED_MODULES or options.copy_style == CopyFileDetection.ALL
):
if options.copy_style == CopyFileDetection.LOADED_MODULES:
# This is the 'auto' semantic by default used for pyflyte run, it only copies loaded .py files.
sys_modules = list(sys.modules.values())
ls, ls_digest = ls_files(str(source), sys_modules, deref_symlinks, ignore)
else:
# This triggers listing of all files, mimicking the old way of creating the tar file.
ls, ls_digest = ls_files(str(source), [], deref_symlinks, ignore)

ls, ls_digest = ls_files(str(source), options.copy_style, deref_symlinks, ignore)
logger.debug(f"Hash digest: {ls_digest}", fg="green")

if options.show_files:
Expand Down
Loading

0 comments on commit 26559fa

Please sign in to comment.