Skip to content

Commit

Permalink
[Serve] Update log pattern in _follow_replica_logs for new UX 3.0 (#…
Browse files Browse the repository at this point in the history
…4333)

* refactor: generalize log streaming from `serve_utils._follow_logs` into centralized `log_utils.follow_logs`

* fix: no yield if its a provision log line

* refactor: naming and if-indent

* fix: adapt to new ux

* fix: prevent recursion in serve log following

* refactor: simplify log expanding match and improve naming

* revert removing redundant checking as per reviewer's input

* naming
  • Loading branch information
andylizf authored Nov 14, 2024
1 parent 1c04aef commit 53871cb
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 30 deletions.
1 change: 1 addition & 0 deletions sky/serve/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,7 @@ def tail_logs(
with ux_utils.print_exception_no_traceback():
raise ValueError(f'`target` must be a string or '
f'sky.serve.ServiceComponent, got {type(target)}.')

if target == serve_utils.ServiceComponent.REPLICA:
if replica_id is None:
with ux_utils.print_exception_no_traceback():
Expand Down
65 changes: 35 additions & 30 deletions sky/serve/serve_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,14 @@
constants.CONTROLLER_MEMORY_USAGE_GB)
_CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'

_SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*'
_SKYPILOT_LOG_PATTERN = r'.*tail -n100 -f (.*\.log).*'
# NOTE(dev): We assume log paths are either in ~/sky_logs/... or ~/.sky/...
# and always appear after a space. Be careful when changing UX as this
# assumption is used to expand some log files while ignoring others.
_SKYPILOT_LOG_DIRS = r'~/(sky_logs|\.sky)'
_SKYPILOT_PROVISION_LOG_PATTERN = (
fr'.* ({_SKYPILOT_LOG_DIRS}/.*provision\.log)')
_SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'

# TODO(tian): Find all existing replica id and print here.
_FAILED_TO_FIND_REPLICA_MSG = (
f'{colorama.Fore.RED}Failed to find replica '
Expand Down Expand Up @@ -591,15 +597,15 @@ def get_latest_version_with_min_replicas(
return active_versions[-1] if active_versions else None


def _follow_replica_logs(
def _follow_logs_with_provision_expanding(
file: TextIO,
cluster_name: str,
*,
should_stop: Callable[[], bool],
stop_on_eof: bool = False,
idle_timeout_seconds: Optional[int] = None,
) -> Iterator[str]:
"""Follows logs for a replica, handling nested log files.
"""Follows logs and expands any provision.log references found.
Args:
file: Log file to read from.
Expand All @@ -610,7 +616,7 @@ def _follow_replica_logs(
new content.
Yields:
Log lines from the main file and any nested log files.
Log lines, including expanded content from referenced provision logs.
"""

def cluster_is_up() -> bool:
Expand All @@ -620,36 +626,35 @@ def cluster_is_up() -> bool:
return cluster_record['status'] == status_lib.ClusterStatus.UP

def process_line(line: str) -> Iterator[str]:
# Tailing detailed progress for user. All logs in skypilot is
# of format `To view detailed progress: tail -n100 -f *.log`.
# Check if the line is directing users to view logs
# The line might be directing users to view logs, like
# `✓ Cluster launched: new-http. View logs at: *.log`
# We should tail the detailed logs for user.
provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
other_log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)

if provision_log_prompt is not None:
nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:
# We still exit if more than 10 seconds without new content
# to avoid any internal bug that causes the launch to fail
# while cluster status remains INIT.
# Originally, we output the next line first before printing
# the launching logs. Since the next line is always
# `Launching on <cloud> <region> (<zone>)`, we output it first
# to indicate the process is starting.
# TODO(andyl): After refactor #4323, the above logic is broken,
# but coincidentally with the new UX 3.0, the `Cluster launched`
# message is printed first, making the output appear correct.
# Explaining this since it's technically a breaking change
# for this refactor PR #4323. Will remove soon in a fix PR
# for adapting the serve.follow_logs to the new UX.
yield from _follow_replica_logs(f,
cluster_name,
should_stop=cluster_is_up,
stop_on_eof=stop_on_eof,
idle_timeout_seconds=10)

try:
with open(nested_log_path, 'r', newline='',
encoding='utf-8') as f:
# We still exit if more than 10 seconds without new content
# to avoid any internal bug that causes the launch to fail
# while cluster status remains INIT.
yield from log_utils.follow_logs(f,
should_stop=cluster_is_up,
stop_on_eof=stop_on_eof,
idle_timeout_seconds=10)
except FileNotFoundError:
yield line

yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
f'Try to expand log file {nested_log_path} but not '
f'found. Skipping...{colorama.Style.RESET_ALL}')
pass
return

if other_log_prompt is not None:
if log_prompt is not None:
# Now we skip other logs (file sync logs) since we lack
# utility to determine when these log files are finished
# writing.
Expand Down Expand Up @@ -702,7 +707,7 @@ def _get_replica_status() -> serve_state.ReplicaStatus:
replica_provisioned = (
lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
for line in _follow_replica_logs(
for line in _follow_logs_with_provision_expanding(
f,
replica_cluster_name,
should_stop=replica_provisioned,
Expand Down

0 comments on commit 53871cb

Please sign in to comment.