Skip to content

Commit

Permalink
Add DLQ support
Browse files Browse the repository at this point in the history
Add support for moving failed messages to deadletter queue in
Redis. The queue can be re-driven via an undocumented endpoint,
`/api/v1/admin/redrive-dlq`.
  • Loading branch information
jaymell committed Oct 8, 2024
1 parent 48ab14d commit 75f243a
Show file tree
Hide file tree
Showing 9 changed files with 197 additions and 11 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,10 @@ To support graceful shutdown on the server, all running tasks are finished befor
One of our main goals with open sourcing the Svix dispatcher is ease of use. The hosted Svix service, however, is quite complex due to our scale and the infrastructure it requires. This complexity is not useful for the vast majority of people and would make this project much harder to use and much more limited.
This is why this code has been adjusted before being released, and some of the features, optimizations, and behaviors supported by the hosted dispatcher are not yet available in this repo. With that being said, other than some known incompatibilities, the internal Svix test suite passes. This means they are already mostly compatible, and we are working hard on bringing them to full feature parity.

# Re-driving Redis DLQ
We have an undocumented endpoint for re-driving failed messages that are DLQ'ed. You can do this by calling `POST /api/v1/admin/redrive-dlq/`.

To monitor the DLQ depth, you should monitor the `svix.queue.depth_dlq` metric. Any non-zero values indicate that there is data in the DLQ.

# Development

Expand Down
7 changes: 7 additions & 0 deletions server/svix-server/src/cfg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ const DEFAULTS: &str = include_str!("../config.default.toml");

pub type Configuration = Arc<ConfigurationInner>;

fn default_redis_pending_duration_secs() -> u64 {
45
}

#[derive(Clone, Debug, Deserialize, Validate)]
#[validate(
schema(function = "validate_config_complete"),
Expand Down Expand Up @@ -200,6 +204,9 @@ pub struct ConfigurationInner {
#[serde(flatten)]
pub proxy_config: Option<ProxyConfig>,

#[serde(default = "default_redis_pending_duration_secs")]
pub redis_pending_duration_secs: u64,

#[serde(flatten)]
pub internal: InternalConfig,
}
Expand Down
21 changes: 21 additions & 0 deletions server/svix-server/src/metrics/redis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ pub struct RedisQueueMetrics {
main_queue: Option<ObservableGauge<u64>>,
pending_queue: Option<ObservableGauge<u64>>,
delayed_queue: Option<ObservableGauge<u64>>,
deadletter_queue: Option<ObservableGauge<u64>>,
}

impl RedisQueueMetrics {
Expand All @@ -65,10 +66,17 @@ impl RedisQueueMetrics {
.try_init(),
);

let deadletter_queue = init_metric(
meter
.u64_observable_gauge("svix.queue.depth_dlq")
.try_init(),
);

Self {
main_queue,
pending_queue,
delayed_queue,
deadletter_queue,
}
}
pub async fn record(
Expand All @@ -77,6 +85,7 @@ impl RedisQueueMetrics {
main_queue: &RedisQueueType<'_>,
pending_queue: &RedisQueueType<'_>,
delayed_queue: &RedisQueueType<'_>,
deadletter_queue: &RedisQueueType<'_>,
) {
main_queue
.queue_depth(redis)
Expand All @@ -99,6 +108,13 @@ impl RedisQueueMetrics {
.unwrap_or_else(|e| {
tracing::warn!("Failed to record queue depth: {e}");
});
deadletter_queue
.queue_depth(redis)
.await
.map(|d| self.record_deadletter_queue_depth(d))
.unwrap_or_else(|e| {
tracing::warn!("Failed to record queue depth: {e}");
});
}

fn record_main_queue_depth(&self, value: u64) {
Expand All @@ -116,4 +132,9 @@ impl RedisQueueMetrics {
recorder.observe(value, &[]);
}
}
fn record_deadletter_queue_depth(&self, value: u64) {
if let Some(recorder) = &self.deadletter_queue {
recorder.observe(value, &[]);
}
}
}
30 changes: 26 additions & 4 deletions server/svix-server/src/queue/redis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

use std::{num::NonZeroUsize, sync::Arc, time::Duration};

use omniqueue::backends::{RedisBackend, RedisConfig};
use omniqueue::backends::{redis::DeadLetterQueueConfig, RedisBackend, RedisConfig};
use redis::{AsyncCommands as _, RedisResult};

use super::{QueueTask, TaskQueueConsumer, TaskQueueProducer};
Expand All @@ -51,6 +51,9 @@ const DELAYED: &str = "{queue}_svix_delayed";
/// The key for the lock guarding the delayed queue background task.
const DELAYED_LOCK: &str = "{queue}_svix_delayed_lock";

/// The key for the DLQ
const DLQ: &str = "{queue}_svix_dlq";

// v2 KEY CONSTANTS
const LEGACY_V2_MAIN: &str = "{queue}_svix_main";
const LEGACY_V2_PROCESSING: &str = "{queue}_svix_processing";
Expand Down Expand Up @@ -82,11 +85,12 @@ pub async fn new_pair(
) -> (TaskQueueProducer, TaskQueueConsumer) {
new_pair_inner(
cfg,
Duration::from_secs(45),
Duration::from_secs(cfg.redis_pending_duration_secs),
prefix.unwrap_or_default(),
MAIN,
DELAYED,
DELAYED_LOCK,
DLQ,
)
.await
}
Expand Down Expand Up @@ -125,10 +129,12 @@ async fn new_pair_inner(
main_queue_name: &'static str,
delayed_queue_name: &'static str,
delayed_lock_name: &'static str,
dlq_name: &'static str,
) -> (TaskQueueProducer, TaskQueueConsumer) {
let main_queue_name = format!("{queue_prefix}{main_queue_name}");
let delayed_queue_name = format!("{queue_prefix}{delayed_queue_name}");
let delayed_lock_name = format!("{queue_prefix}{delayed_lock_name}");
let dlq_name = format!("{queue_prefix}{dlq_name}");

// This fn is only called from
// - `queue::new_pair` if the queue type is redis and a DSN is set
Expand Down Expand Up @@ -205,6 +211,7 @@ async fn new_pair_inner(
let pool = pool.clone();
let main_queue_name = main_queue_name.clone();
let delayed_queue_name = delayed_queue_name.clone();
let deadletter_queue_name = dlq_name.clone();

async move {
let mut interval = tokio::time::interval(Duration::from_secs(1));
Expand All @@ -214,12 +221,19 @@ async fn new_pair_inner(
group: WORKERS_GROUP,
};
let delayed_queue = RedisQueueType::SortedSet(&delayed_queue_name);
let deadletter_queue = RedisQueueType::List(&deadletter_queue_name);
let metrics =
crate::metrics::RedisQueueMetrics::new(&opentelemetry::global::meter("svix.com"));
loop {
interval.tick().await;
metrics
.record(&pool, &main_queue, &pending, &delayed_queue)
.record(
&pool,
&main_queue,
&pending,
&delayed_queue,
&deadletter_queue,
)
.await;
}
}
Expand All @@ -236,7 +250,10 @@ async fn new_pair_inner(
consumer_name: WORKER_CONSUMER.to_owned(),
payload_key: QUEUE_KV_KEY.to_owned(),
ack_deadline_ms: pending_duration,
dlq_config: None,
dlq_config: Some(DeadLetterQueueConfig {
queue_key: dlq_name.to_string(),
max_receives: 3,
}),
sentinel_config: cfg.redis_sentinel_cfg.clone().map(|c| c.into()),
};

Expand Down Expand Up @@ -502,6 +519,7 @@ pub mod tests {
"{test}_idle_period",
"{test}_idle_period_delayed",
"{test}_idle_period_delayed_lock",
"{test}_dlq",
)
.await;

Expand Down Expand Up @@ -572,6 +590,7 @@ pub mod tests {
"{test}_ack",
"{test}_ack_delayed",
"{test}_ack_delayed_lock",
"{test}_dlq",
)
.await;

Expand Down Expand Up @@ -618,6 +637,7 @@ pub mod tests {
"{test}_nack",
"{test}_nack_delayed",
"{test}_nack_delayed_lock",
"{test}_dlq",
)
.await;

Expand Down Expand Up @@ -661,6 +681,7 @@ pub mod tests {
"{test}_delay",
"{test}_delay_delayed",
"{test}_delay_delayed_lock",
"{test}_dlq",
)
.await;

Expand Down Expand Up @@ -834,6 +855,7 @@ pub mod tests {
v3_main,
v2_delayed,
v2_delayed_lock,
"dlq-bruh",
)
.await;

Expand Down
28 changes: 28 additions & 0 deletions server/svix-server/src/v1/endpoints/admin.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
use aide::{
axum::{routing::post_with, ApiRouter},
transform::TransformPathItem,
};
use axum::extract::State;
use svix_server_derive::aide_annotate;

use crate::{core::permissions, error::Result, v1::utils::NoContent, AppState};

/// Redrive DLQ
#[aide_annotate(op_id = "v1.admin.redrive-dlq")]
pub async fn redrive_dlq(
State(AppState { queue_tx, .. }): State<AppState>,
_: permissions::Organization,
) -> Result<NoContent> {
if let Err(e) = queue_tx.redrive_dlq().await {
tracing::warn!(error = ?e, "DLQ redrive failed");
}
Ok(NoContent)
}

pub fn router() -> ApiRouter<AppState> {
ApiRouter::new().api_route_with(
"/admin/redrive-dlq",
post_with(redrive_dlq, redrive_dlq_operation),
move |op: TransformPathItem<'_>| op.tag("Admin".as_ref()).hidden(true),
)
}
1 change: 1 addition & 0 deletions server/svix-server/src/v1/endpoints/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// SPDX-FileCopyrightText: © 2022 Svix Authors
// SPDX-License-Identifier: MIT

pub mod admin;
pub mod application;
pub mod attempt;
pub mod auth;
Expand Down
1 change: 1 addition & 0 deletions server/svix-server/src/v1/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ pub fn router() -> ApiRouter<AppState> {
.merge(endpoints::event_type::router())
.merge(endpoints::message::router())
.merge(endpoints::attempt::router())
.merge(endpoints::admin::router())
.layer(
TraceLayer::new_for_http()
.make_span_with(AxumOtelSpanCreator)
Expand Down
99 changes: 98 additions & 1 deletion server/svix-server/tests/it/redis_queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,27 @@

use std::{str::FromStr, time::Duration};

use http::StatusCode;
use redis::AsyncCommands as _;
use svix_ksuid::KsuidLike;
use svix_server::{
cfg::Configuration,
core::types::{ApplicationId, EndpointId, MessageAttemptTriggerType, MessageId},
core::types::{
ApplicationId, BaseId, EndpointId, MessageAttemptTriggerType, MessageId, OrganizationId,
},
queue::{
new_pair, MessageTask, QueueTask, TaskQueueConsumer, TaskQueueDelivery, TaskQueueProducer,
},
redis::RedisManager,
v1::endpoints::message::MessageOut,
};
use tokio::time::timeout;

use crate::utils::{
common_calls::{create_test_app, create_test_endpoint, message_in},
get_default_test_config, start_svix_server_with_cfg_and_org_id_and_prefix,
};

// TODO: Don't copy this from the Redis queue test directly, place the fn somewhere both can access
async fn get_pool(cfg: &Configuration) -> RedisManager {
RedisManager::from_queue_backend(&cfg.queue_backend(), cfg.redis_pool_max_size).await
Expand Down Expand Up @@ -148,3 +158,90 @@ async fn test_many_queue_consumers_delayed() {
)
.await;
}

#[tokio::test]
#[ignore]
async fn test_redis_streams_dlq() {
let mut cfg = get_default_test_config();
cfg.worker_enabled = false;
cfg.redis_pending_duration_secs = 1;

let cfg = std::sync::Arc::new(cfg);
let prefix = svix_ksuid::Ksuid::new(None, None).to_string();

let pool = get_pool(&cfg).await;
let mut conn = pool.get().await.unwrap();

let _: () = conn
.del(format!("{prefix}{{queue}}_svix_v3_main"))
.await
.unwrap();

let _: () = conn
.del(format!("{prefix}{{queue}}_svix_dlq"))
.await
.unwrap();

let (client, _jh) = start_svix_server_with_cfg_and_org_id_and_prefix(
&cfg,
OrganizationId::new(None, None),
prefix.clone(),
)
.await;

let app_id = create_test_app(&client, "v1MessageCRTestApp")
.await
.unwrap()
.id;

let _endp_id = create_test_endpoint(&client, &app_id, "http://localhost:2/bad/url/")
.await
.unwrap()
.id;

let _message_1: MessageOut = client
.post(
&format!("api/v1/app/{app_id}/msg/"),
message_in(&app_id, serde_json::json!({"test": "value"})).unwrap(),
StatusCode::ACCEPTED,
)
.await
.unwrap();

let (_p, mut c) = new_pair(&cfg, Some(&prefix)).await;

let wait_time = std::time::Duration::from_millis(1_500);
for _ in 0..3 {
let res = c.receive_all(wait_time).await.unwrap();
assert!(!res.is_empty());
for j in res {
j.nack().await.unwrap();
}
}

let res = c.receive_all(wait_time).await.unwrap();
assert!(res.is_empty());

tokio::time::sleep(wait_time).await;

// Redrive
client
.post_without_response(
"/api/v1/admin/redrive-dlq",
serde_json::Value::Null,
StatusCode::NO_CONTENT,
)
.await
.unwrap();

for _ in 0..3 {
let res = c.receive_all(wait_time).await.unwrap();
assert!(!res.is_empty());
for j in res {
j.nack().await.unwrap();
}
}

let res = c.receive_all(wait_time).await.unwrap();
assert!(res.is_empty());
}
Loading

0 comments on commit 75f243a

Please sign in to comment.