Skip to content

Commit

Permalink
Timeline archival test (#8824)
Browse files Browse the repository at this point in the history
This PR:

* Implements the rule that archived timelines require all of their
children to be archived as well, as specified in the RFC. There is no
fancy locking mechanism though, so the precondition can still be broken.
As a TODO for later, we still allow unarchiving timelines with archived
parents.
* Adds an `is_archived` flag to `TimelineInfo`
* Adds timeline_archival_config to `PageserverHttpClient`
* Adds a new `test_timeline_archive` test, loosely based on
`test_timeline_delete`

Part of #8088
  • Loading branch information
arpad-m authored Aug 26, 2024
1 parent d6eede5 commit 2dd53e7
Show file tree
Hide file tree
Showing 6 changed files with 207 additions and 10 deletions.
1 change: 1 addition & 0 deletions libs/pageserver_api/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,7 @@ pub struct TimelineInfo {
pub pg_version: u32,

pub state: TimelineState,
pub is_archived: bool,

pub walreceiver_status: String,

Expand Down
25 changes: 22 additions & 3 deletions pageserver/src/http/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,24 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
}
}

impl From<crate::tenant::TimelineArchivalError> for ApiError {
fn from(value: crate::tenant::TimelineArchivalError) -> Self {
use crate::tenant::TimelineArchivalError::*;
match value {
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
format!(
"Cannot archive timeline which has non-archived child timelines: {children:?}"
)
.into_boxed_str(),
),
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
Other(e) => ApiError::InternalServerError(e),
}
}
}

impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
use crate::tenant::mgr::DeleteTimelineError::*;
Expand Down Expand Up @@ -405,6 +423,8 @@ async fn build_timeline_info_common(
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
let current_physical_size = Some(timeline.layer_size_sum().await);
let state = timeline.current_state();
// Report is_archived = false if the timeline is still loading
let is_archived = timeline.is_archived().unwrap_or(false);
let remote_consistent_lsn_projected = timeline
.get_remote_consistent_lsn_projected()
.unwrap_or(Lsn(0));
Expand Down Expand Up @@ -445,6 +465,7 @@ async fn build_timeline_info_common(
pg_version: timeline.pg_version,

state,
is_archived,

walreceiver_status,

Expand Down Expand Up @@ -686,9 +707,7 @@ async fn timeline_archival_config_handler(

tenant
.apply_timeline_archival_config(timeline_id, request_data.state)
.await
.context("applying archival config")
.map_err(ApiError::InternalServerError)?;
.await?;
Ok::<_, ApiError>(())
}
.instrument(info_span!("timeline_archival_config",
Expand Down
70 changes: 64 additions & 6 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,38 @@ impl Debug for DeleteTimelineError {
}
}

/// Reasons why applying a timeline archival config can fail.
///
/// `Debug` is implemented by hand for this type rather than derived.
#[derive(thiserror::Error)]
pub enum TimelineArchivalError {
/// The requested timeline id is not present in the tenant's timeline map.
#[error("NotFound")]
NotFound,

/// Waiting for the remote upload queue to complete exceeded the wait limit.
#[error("Timeout")]
Timeout,

/// Archival was requested for a timeline that still has child timelines
/// which are not archived; carries the ids of those children.
#[error("HasUnarchivedChildren")]
HasUnarchivedChildren(Vec<TimelineId>),

/// Reported to HTTP clients as a conflict.
// NOTE(review): no visible code path constructs this variant in this change — confirm where it is produced.
#[error("Timeline archival is already in progress")]
AlreadyInProgress,

/// Any other failure; `#[from]` lets an `anyhow::Error` convert via `?`.
#[error(transparent)]
Other(#[from] anyhow::Error),
}

impl Debug for TimelineArchivalError {
    /// Formats the error as its variant name, followed by the payload in
    /// tuple form for the variants that carry one.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // Payload-free variants: just the bare name.
            Self::NotFound => f.write_str("NotFound"),
            Self::Timeout => f.write_str("Timeout"),
            Self::AlreadyInProgress => f.write_str("AlreadyInProgress"),
            // Variants with a payload: `Name(payload)`.
            Self::HasUnarchivedChildren(children) => f
                .debug_tuple("HasUnarchivedChildren")
                .field(children)
                .finish(),
            Self::Other(source) => f.debug_tuple("Other").field(source).finish(),
        }
    }
}

pub enum SetStoppingError {
AlreadyStopping(completion::Barrier),
Broken,
Expand Down Expand Up @@ -1326,24 +1358,50 @@ impl Tenant {
&self,
timeline_id: TimelineId,
state: TimelineArchivalState,
) -> anyhow::Result<()> {
let timeline = self
.get_timeline(timeline_id, false)
.context("Cannot apply timeline archival config to inexistent timeline")?;
) -> Result<(), TimelineArchivalError> {
info!("setting timeline archival config");
let timeline = {
let timelines = self.timelines.lock().unwrap();

let timeline = match timelines.get(&timeline_id) {
Some(t) => t,
None => return Err(TimelineArchivalError::NotFound),
};

// Ensure that there are no non-archived child timelines
let children: Vec<TimelineId> = timelines
.iter()
.filter_map(|(id, entry)| {
if entry.get_ancestor_timeline_id() != Some(timeline_id) {
return None;
}
if entry.is_archived() == Some(true) {
return None;
}
Some(*id)
})
.collect();

if !children.is_empty() && state == TimelineArchivalState::Archived {
return Err(TimelineArchivalError::HasUnarchivedChildren(children));
}
Arc::clone(timeline)
};

let upload_needed = timeline
.remote_client
.schedule_index_upload_for_timeline_archival_state(state)?;

if upload_needed {
info!("Uploading new state");
const MAX_WAIT: Duration = Duration::from_secs(10);
let Ok(v) =
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
else {
tracing::warn!("reached timeout for waiting on upload queue");
bail!("reached timeout for upload queue flush");
return Err(TimelineArchivalError::Timeout);
};
v?;
v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
}
Ok(())
}
Expand Down
7 changes: 7 additions & 0 deletions test_runner/fixtures/common_types.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import random
from dataclasses import dataclass
from enum import Enum
from functools import total_ordering
from typing import Any, Dict, Type, TypeVar, Union

Expand Down Expand Up @@ -213,3 +214,9 @@ def __eq__(self, other) -> bool:

def __hash__(self) -> int:
return hash(self._tuple())


# TODO: Replace with `StrEnum` when we upgrade to python 3.11
class TimelineArchivalState(str, Enum):
    """Archival state of a timeline.

    The member values are the strings sent as the ``state`` field of a
    timeline archival config request. Mixing in ``str`` lets members compare
    equal to their plain string values.
    """

    ARCHIVED = "Archived"
    UNARCHIVED = "Unarchived"
18 changes: 17 additions & 1 deletion test_runner/fixtures/pageserver/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId
from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.pg_version import PgVersion
Expand Down Expand Up @@ -621,6 +621,22 @@ def timeline_preserve_initdb_archive(
)
self.verbose_error(res)

def timeline_archival_config(
    self,
    tenant_id: Union[TenantId, TenantShardId],
    timeline_id: TimelineId,
    state: TimelineArchivalState,
):
    """POST the desired archival state for a timeline to the pageserver.

    The request body is ``{"state": <state value>}``. The response is passed
    to ``self.verbose_error`` for error handling.
    """
    payload = {"state": state.value}
    log.info(
        f"requesting timeline archival config {payload} for tenant {tenant_id} and timeline {timeline_id}"
    )
    url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config"
    response = self.post(url, json=payload)
    self.verbose_error(response)

def timeline_get_lsn_by_timestamp(
self,
tenant_id: Union[TenantId, TenantShardId],
Expand Down
96 changes: 96 additions & 0 deletions test_runner/regress/test_timeline_archive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import pytest
from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId
from fixtures.neon_fixtures import (
NeonEnv,
)
from fixtures.pageserver.http import PageserverApiException


def test_timeline_archive(neon_simple_env: NeonEnv):
    """Exercise the pageserver's timeline archival config endpoint.

    Covers:
    * archiving an unknown timeline / unknown tenant is rejected with 404,
    * archiving a parent with a non-archived child is rejected with 412,
    * `is_archived` in timeline detail reflects the archival state,
    * archiving the leaf first and then the parent succeeds.
    """
    env = neon_simple_env

    env.pageserver.allowed_errors.extend(
        [
            ".*Timeline .* was not found.*",
            ".*timeline not found.*",
            # Must match the wording of the error the pageserver actually emits.
            ".*Cannot archive timeline which has non-archived child timelines.*",
            ".*Precondition failed: Requested tenant is missing.*",
        ]
    )

    ps_http = env.pageserver.http_client()

    # First try to archive a non-existent timeline of an existing tenant.
    invalid_timeline_id = TimelineId.generate()
    with pytest.raises(PageserverApiException, match="timeline not found") as exc:
        ps_http.timeline_archival_config(
            tenant_id=env.initial_tenant,
            timeline_id=invalid_timeline_id,
            state=TimelineArchivalState.ARCHIVED,
        )

    assert exc.value.status_code == 404

    # Then try a non-existent tenant.
    invalid_tenant_id = TenantId.generate()
    with pytest.raises(
        PageserverApiException,
        match=f"NotFound: tenant {invalid_tenant_id}",
    ) as exc:
        ps_http.timeline_archival_config(
            tenant_id=invalid_tenant_id,
            timeline_id=invalid_timeline_id,
            state=TimelineArchivalState.ARCHIVED,
        )

    assert exc.value.status_code == 404

    # Construct a pair of branches to validate that the pageserver prohibits
    # archival of ancestor timelines while they have non-archived child branches.
    parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent", "empty")

    leaf_timeline_id = env.neon_cli.create_branch(
        "test_ancestor_branch_archive_branch1", "test_ancestor_branch_archive_parent"
    )

    # Sanity-check the parent exists on disk before issuing the request; kept
    # outside `pytest.raises` so only the call under test is inside it.
    timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id)
    assert timeline_path.exists()

    with pytest.raises(
        PageserverApiException,
        match="Cannot archive timeline which has non-archived child timelines",
    ) as exc:
        ps_http.timeline_archival_config(
            tenant_id=env.initial_tenant,
            timeline_id=parent_timeline_id,
            state=TimelineArchivalState.ARCHIVED,
        )

    assert exc.value.status_code == 412

    # Test timeline_detail: the leaf has not been archived yet.
    leaf_detail = ps_http.timeline_detail(
        tenant_id=env.initial_tenant,
        timeline_id=leaf_timeline_id,
    )
    assert leaf_detail["is_archived"] is False

    # Test that archiving the leaf timeline and then the parent works.
    ps_http.timeline_archival_config(
        tenant_id=env.initial_tenant,
        timeline_id=leaf_timeline_id,
        state=TimelineArchivalState.ARCHIVED,
    )
    leaf_detail = ps_http.timeline_detail(
        tenant_id=env.initial_tenant,
        timeline_id=leaf_timeline_id,
    )
    assert leaf_detail["is_archived"] is True

    ps_http.timeline_archival_config(
        tenant_id=env.initial_tenant,
        timeline_id=parent_timeline_id,
        state=TimelineArchivalState.ARCHIVED,
    )
    # The parent must now be reported as archived as well.
    parent_detail = ps_http.timeline_detail(
        tenant_id=env.initial_tenant,
        timeline_id=parent_timeline_id,
    )
    assert parent_detail["is_archived"] is True

1 comment on commit 2dd53e7

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

3861 tests run: 3745 passed, 0 failed, 116 skipped (full report)


Flaky tests (2)

Postgres 14

Code coverage* (full report)

  • functions: 32.1% (7256 of 22571 functions)
  • lines: 50.3% (58806 of 116980 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
2dd53e7 at 2024-08-26T17:21:00.008Z :recycle:

Please sign in to comment.