From 2dd53e7ae0adf7c8a5856bb86a287eddd591718d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 26 Aug 2024 17:30:19 +0200 Subject: [PATCH] Timeline archival test (#8824) This PR: * Implements the rule that archived timelines require all of their children to be archived as well, as specified in the RFC. There is no fancy locking mechanism though, so the precondition can still be broken. As a TODO for later, we still allow unarchiving timelines with archived parents. * Adds an `is_archived` flag to `TimelineInfo` * Adds timeline_archival_config to `PageserverHttpClient` * Adds a new `test_timeline_archive` test, loosely based on `test_timeline_delete` Part of #8088 --- libs/pageserver_api/src/models.rs | 1 + pageserver/src/http/routes.rs | 25 ++++- pageserver/src/tenant.rs | 70 ++++++++++++-- test_runner/fixtures/common_types.py | 7 ++ test_runner/fixtures/pageserver/http.py | 18 +++- test_runner/regress/test_timeline_archive.py | 96 ++++++++++++++++++++ 6 files changed, 207 insertions(+), 10 deletions(-) create mode 100644 test_runner/regress/test_timeline_archive.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d55c06b685f6..4cab56771b8e 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -718,6 +718,7 @@ pub struct TimelineInfo { pub pg_version: u32, pub state: TimelineState, + pub is_archived: bool, pub walreceiver_status: String, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 4635e76ea914..cbcc162b325f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -318,6 +318,24 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(value: crate::tenant::TimelineArchivalError) -> Self { + use crate::tenant::TimelineArchivalError::*; + match value { + NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), + Timeout => ApiError::Timeout("hit pageserver internal timeout".into()), + HasUnarchivedChildren(children) => ApiError::PreconditionFailed( + format!( + "Cannot archive timeline which has non-archived child timelines: {children:?}" + ) + .into_boxed_str(), + ), + a @ AlreadyInProgress => ApiError::Conflict(a.to_string()), + Other(e) => ApiError::InternalServerError(e), + } + } +} + impl From for ApiError { fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self { use crate::tenant::mgr::DeleteTimelineError::*; @@ -405,6 +423,8 @@ async fn build_timeline_info_common( let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx); let current_physical_size = Some(timeline.layer_size_sum().await); let state = timeline.current_state(); + // Report is_archived = false if the timeline is still loading + let is_archived = timeline.is_archived().unwrap_or(false); let remote_consistent_lsn_projected = timeline .get_remote_consistent_lsn_projected() .unwrap_or(Lsn(0)); @@ -445,6 +465,7 @@ async fn build_timeline_info_common( pg_version: timeline.pg_version, state, + is_archived, walreceiver_status, @@ -686,9 +707,7 @@ async fn timeline_archival_config_handler( tenant .apply_timeline_archival_config(timeline_id, request_data.state) - .await - .context("applying archival config") - .map_err(ApiError::InternalServerError)?; + .await?; Ok::<_, ApiError>(()) } .instrument(info_span!("timeline_archival_config", diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 3a7afff21144..d3589a12c8c6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -501,6 +501,38 @@ impl Debug for DeleteTimelineError { } } +#[derive(thiserror::Error)] +pub enum TimelineArchivalError { + #[error("NotFound")] + NotFound, + + #[error("Timeout")] + Timeout, + + #[error("HasUnarchivedChildren")] + HasUnarchivedChildren(Vec), + + #[error("Timeline archival is already in progress")] + AlreadyInProgress, + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl Debug for TimelineArchivalError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NotFound => write!(f, "NotFound"), + Self::Timeout => write!(f, "Timeout"), + Self::HasUnarchivedChildren(c) => { + f.debug_tuple("HasUnarchivedChildren").field(c).finish() + } + Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(), + Self::Other(e) => f.debug_tuple("Other").field(e).finish(), + } + } +} + pub enum SetStoppingError { AlreadyStopping(completion::Barrier), Broken, @@ -1326,24 +1358,50 @@ impl Tenant { &self, timeline_id: TimelineId, state: TimelineArchivalState, - ) -> anyhow::Result<()> { - let timeline = self - .get_timeline(timeline_id, false) - .context("Cannot apply timeline archival config to inexistent timeline")?; + ) -> Result<(), TimelineArchivalError> { + info!("setting timeline archival config"); + let timeline = { + let timelines = self.timelines.lock().unwrap(); + + let timeline = match timelines.get(&timeline_id) { + Some(t) => t, + None => return Err(TimelineArchivalError::NotFound), + }; + + // Ensure that there are no non-archived child timelines + let children: Vec = timelines + .iter() + .filter_map(|(id, entry)| { + if entry.get_ancestor_timeline_id() != Some(timeline_id) { + return None; + } + if entry.is_archived() == Some(true) { + return None; + } + Some(*id) + }) + .collect(); + + if !children.is_empty() && state == TimelineArchivalState::Archived { + return Err(TimelineArchivalError::HasUnarchivedChildren(children)); + } + Arc::clone(timeline) + }; let upload_needed = timeline .remote_client .schedule_index_upload_for_timeline_archival_state(state)?; if upload_needed { + info!("Uploading new state"); const MAX_WAIT: Duration = Duration::from_secs(10); let Ok(v) = tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await else { tracing::warn!("reached timeout for waiting on upload queue"); - bail!("reached timeout for upload queue flush"); + return Err(TimelineArchivalError::Timeout); }; - v?; + v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; } Ok(()) } diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 7cadcbb4c256..8eda19d1e2f0 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -1,5 +1,6 @@ import random from dataclasses import dataclass +from enum import Enum from functools import total_ordering from typing import Any, Dict, Type, TypeVar, Union @@ -213,3 +214,9 @@ def __eq__(self, other) -> bool: def __hash__(self) -> int: return hash(self._tuple()) + + +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class TimelineArchivalState(str, Enum): + ARCHIVED = "Archived" + UNARCHIVED = "Unarchived" diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index cd4261f1b84b..582f9c026439 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -10,7 +10,7 @@ from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry -from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion @@ -621,6 +621,22 @@ def timeline_preserve_initdb_archive( ) self.verbose_error(res) + def timeline_archival_config( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + state: TimelineArchivalState, + ): + config = {"state": state.value} + log.info( + f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}" + ) + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config", + json=config, + ) + self.verbose_error(res) + def timeline_get_lsn_by_timestamp( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py new file mode 100644 index 000000000000..b774c7c9fe6a --- /dev/null +++ b/test_runner/regress/test_timeline_archive.py @@ -0,0 +1,96 @@ +import pytest +from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId +from fixtures.neon_fixtures import ( + NeonEnv, +) +from fixtures.pageserver.http import PageserverApiException + + +def test_timeline_archive(neon_simple_env: NeonEnv): + env = neon_simple_env + + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found.*", + ".*timeline not found.*", + ".*Cannot archive timeline which has unarchived child timelines.*", + ".*Precondition failed: Requested tenant is missing.*", + ] + ) + + ps_http = env.pageserver.http_client() + + # first try to archive non existing timeline + # for existing tenant: + invalid_timeline_id = TimelineId.generate() + with pytest.raises(PageserverApiException, match="timeline not found") as exc: + ps_http.timeline_archival_config( + tenant_id=env.initial_tenant, + timeline_id=invalid_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + assert exc.value.status_code == 404 + + # for non existing tenant: + invalid_tenant_id = TenantId.generate() + with pytest.raises( + PageserverApiException, + match=f"NotFound: tenant {invalid_tenant_id}", + ) as exc: + ps_http.timeline_archival_config( + tenant_id=invalid_tenant_id, + timeline_id=invalid_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + assert exc.value.status_code == 404 + + # construct pair of branches to validate that pageserver prohibits + # archival of ancestor timelines when they have non-archived child branches + parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent", "empty") + + leaf_timeline_id = env.neon_cli.create_branch( + "test_ancestor_branch_archive_branch1", "test_ancestor_branch_archive_parent" + ) + + timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id) + + with pytest.raises( + PageserverApiException, + match="Cannot archive timeline which has non-archived child timelines", + ) as exc: + assert timeline_path.exists() + + ps_http.timeline_archival_config( + tenant_id=env.initial_tenant, + timeline_id=parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + assert exc.value.status_code == 412 + + # Test timeline_detail + leaf_detail = ps_http.timeline_detail( + tenant_id=env.initial_tenant, + timeline_id=leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is False + + # Test that archiving the leaf timeline and then the parent works + ps_http.timeline_archival_config( + tenant_id=env.initial_tenant, + timeline_id=leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id=env.initial_tenant, + timeline_id=leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + ps_http.timeline_archival_config( + tenant_id=env.initial_tenant, + timeline_id=parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + )