Skip to content

Commit

Permalink
[graphql] Improve the health check (#16016)
Browse files Browse the repository at this point in the history
## Description 

Improve the health check by removing the query to the DB and replacing
it with a status code 200 if all is OK, or 500 on error. It also cleans
up the previous health check checks in the logging logic.

It also moves metrics and connection under a `AppState` struct as `axum`
does not allow states with different types on the router, so we need to
bundle them under one state and then have `FromRef` implementations for
the substates.

## Test Plan 

Existing tests, manually analyzing the logs.

---
If your changes are not user-facing and do not break anything, you can
skip the following section. Otherwise, please briefly describe what has
changed under the Release Notes section.

### Type of Change (Check all that apply)

- [ ] protocol change
- [ ] user-visible impact
- [ ] breaking change for a client SDKs
- [ ] breaking change for FNs (FN binary must upgrade)
- [ ] breaking change for validators or node operators (must upgrade
binaries)
- [ ] breaking change for on-chain data layout
- [ ] necessitate either a data wipe or data migration

### Release notes
  • Loading branch information
stefan-mysten authored Jan 31, 2024
1 parent 2677541 commit 5a7c5b3
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 63 deletions.
11 changes: 3 additions & 8 deletions crates/sui-graphql-rpc/src/extensions/logger.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ impl Extension for LoggerExtension {
.any(|(_, operation)| operation.node.selection_set.node.items.iter().any(|selection| matches!(&selection.node, Selection::Field(field) if field.node.name.node == "__schema")));
let query_id: &Uuid = ctx.data_unchecked();
let session_id: &SocketAddr = ctx.data_unchecked();
if !is_schema && self.config.log_request_query && !is_health_check_request(query_id) {
if !is_schema && self.config.log_request_query {
info!(
%query_id,
%session_id,
Expand All @@ -103,7 +103,7 @@ impl Extension for LoggerExtension {
let res = next.run(ctx).await?;
let query_id: &Uuid = ctx.data_unchecked();
let session_id: &SocketAddr = ctx.data_unchecked();
if self.config.log_complexity && !is_health_check_request(query_id) {
if self.config.log_complexity {
info!(
%query_id,
%session_id,
Expand Down Expand Up @@ -187,7 +187,7 @@ impl Extension for LoggerExtension {
)
}
}
} else if self.config.log_response && !is_health_check_request(query_id) {
} else if self.config.log_response {
match operation_name {
Some("IntrospectionQuery") => {
debug!(
Expand All @@ -206,8 +206,3 @@ impl Extension for LoggerExtension {
resp
}
}

/// Check if the request comes from the `health` endpoint, which has the nil uuid
fn is_health_check_request(id: &Uuid) -> bool {
id.is_nil()
}
131 changes: 76 additions & 55 deletions crates/sui-graphql-rpc/src/server/builder.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
// Copyright (c) Mysten Labs, Inc.
// SPDX-License-Identifier: Apache-2.0

use crate::config::{MAX_CONCURRENT_REQUESTS, RPC_TIMEOUT_ERR_SLEEP_RETRY_PERIOD};
use crate::config::{
ConnectionConfig, MAX_CONCURRENT_REQUESTS, RPC_TIMEOUT_ERR_SLEEP_RETRY_PERIOD,
};
use crate::context_data::package_cache::DbPackageStore;
use crate::data::Db;

use crate::metrics::Metrics;
use crate::mutation::Mutation;
use crate::types::move_object::IMoveObject;
Expand All @@ -27,8 +30,9 @@ use async_graphql::extensions::Tracing;
use async_graphql::EmptySubscription;
use async_graphql::{extensions::ExtensionFactory, Schema, SchemaBuilder};
use async_graphql_axum::{GraphQLRequest, GraphQLResponse};
use axum::extract::FromRef;
use axum::extract::{connect_info::IntoMakeServiceWithConnectInfo, ConnectInfo, State};
use axum::http::HeaderMap;
use axum::http::{HeaderMap, StatusCode};
use axum::middleware::{self};
use axum::response::IntoResponse;
use axum::routing::{post, MethodRouter, Route};
Expand All @@ -38,6 +42,7 @@ use hyper::server::conn::AddrIncoming as HyperAddrIncoming;
use hyper::Body;
use hyper::Server as HyperServer;
use std::convert::Infallible;
use std::net::TcpStream;
use std::{any::Any, net::SocketAddr, time::Instant};
use sui_package_resolver::{PackageStoreWithLruCache, Resolver};
use sui_sdk::SuiClientBuilder;
Expand All @@ -46,8 +51,6 @@ use tower::{Layer, Service};
use tracing::{info, warn};
use uuid::Uuid;

const HEALTH_UUID: Uuid = Uuid::nil();

pub struct Server {
pub server: HyperServer<HyperAddrIncoming, IntoMakeServiceWithConnectInfo<Router, SocketAddr>>,
}
Expand All @@ -62,27 +65,52 @@ impl Server {
}

pub(crate) struct ServerBuilder {
port: u16,
host: String,

state: AppState,
schema: SchemaBuilder<Query, Mutation, EmptySubscription>,
router: Option<Router>,
}

#[derive(Clone)]
pub(crate) struct AppState {
connection: ConnectionConfig,
metrics: Metrics,
}

impl AppState {
fn new(connection: ConnectionConfig, metrics: Metrics) -> Self {
Self {
connection,
metrics,
}
}
}

impl FromRef<AppState> for ConnectionConfig {
fn from_ref(app_state: &AppState) -> ConnectionConfig {
app_state.connection.clone()
}
}

impl FromRef<AppState> for Metrics {
fn from_ref(app_state: &AppState) -> Metrics {
app_state.metrics.clone()
}
}

impl ServerBuilder {
pub fn new(port: u16, host: String, metrics: Metrics) -> Self {
pub fn new(state: AppState) -> Self {
Self {
port,
host,
state,
schema: schema_builder(),
router: None,
metrics,
}
}

pub fn address(&self) -> String {
format!("{}:{}", self.host, self.port)
format!(
"{}:{}",
self.state.connection.host, self.state.connection.port
)
}

pub fn context_data(mut self, context_data: impl Any + Send + Sync) -> Self {
Expand Down Expand Up @@ -115,9 +143,9 @@ impl ServerBuilder {
.route("/", post(graphql_handler))
.route("/graphql", post(graphql_handler))
.route("/health", axum::routing::get(health_checks))
.with_state(self.metrics.clone())
.with_state(self.state.clone())
.route_layer(middleware::from_fn_with_state(
self.metrics.clone(),
self.state.metrics.clone(),
check_version_middleware,
))
.layer(middleware::from_fn(set_version_middleware));
Expand Down Expand Up @@ -185,12 +213,8 @@ impl ServerBuilder {

// METRICS
let metrics = Metrics::new(&registry);

let mut builder = ServerBuilder::new(
config.connection.port,
config.connection.host.clone(),
metrics.clone(),
);
let state = AppState::new(config.connection.clone(), metrics.clone());
let mut builder = ServerBuilder::new(state);

let name_service_config = config.name_service.clone();
let reader = PgManager::reader_with_config(
Expand Down Expand Up @@ -300,36 +324,27 @@ async fn graphql_handler(
result.into()
}

async fn health_checks(
ConnectInfo(addr): ConnectInfo<SocketAddr>,
schema: axum::Extension<SuiGraphQLSchema>,
req: GraphQLRequest,
) -> impl axum::response::IntoResponse {
// Simple request to check if the DB is up
// TODO: add more checks and figure out better ways to do these health checks
let mut req = req.into_inner();
req.data.insert(addr);
// insert the NIL UUID which helps avoid logging these health requests
req.data.insert(HEALTH_UUID);
req.query = r#"
query {
chainIdentifier
}
"#
.to_string();
let db_up = match schema.execute(req).await.is_ok() {
true => "UP",
false => "DOWN",
/// Connect via a TCPStream to the DB to check if it is alive
async fn health_checks(State(connection): State<ConnectionConfig>) -> StatusCode {
let Ok(url) = reqwest::Url::parse(connection.db_url.as_str()) else {
return StatusCode::INTERNAL_SERVER_ERROR;
};
let uptime = get_or_init_server_start_time()
.await
.elapsed()
.as_secs_f64();
format!(
r#"{{"status": "UP","uptime": {},"checks": {{"DB": "{}",}}}}
"#,
uptime, db_up
)

let Some(host) = url.host_str() else {
return StatusCode::INTERNAL_SERVER_ERROR;
};

let tcp_url = if let Some(port) = url.port() {
format!("{host}:{port}")
} else {
host.to_string()
};

if TcpStream::connect(tcp_url).is_err() {
StatusCode::INTERNAL_SERVER_ERROR
} else {
StatusCode::OK
}
}

// One server per proc, so this is okay
Expand Down Expand Up @@ -433,7 +448,8 @@ pub mod tests {
let metrics = metrics();
let db = Db::new(reader.clone(), cfg.limits, metrics.clone());
let pg_conn_pool = PgManager::new(reader);
let schema = ServerBuilder::new(8000, "127.0.0.1".to_string(), metrics)
let state = AppState::new(connection_config.clone(), metrics.clone());
let schema = ServerBuilder::new(state)
.context_data(db)
.context_data(pg_conn_pool)
.context_data(cfg)
Expand Down Expand Up @@ -487,7 +503,8 @@ pub mod tests {
let metrics = metrics();
let db = Db::new(reader.clone(), service_config.limits, metrics.clone());
let pg_conn_pool = PgManager::new(reader);
let schema = ServerBuilder::new(8000, "127.0.0.1".to_string(), metrics.clone())
let state = AppState::new(connection_config.clone(), metrics.clone());
let schema = ServerBuilder::new(state)
.context_data(db)
.context_data(pg_conn_pool)
.context_data(service_config)
Expand Down Expand Up @@ -560,7 +577,8 @@ pub mod tests {
let metrics = metrics();
let db = Db::new(reader.clone(), service_config.limits, metrics.clone());
let pg_conn_pool = PgManager::new(reader);
let schema = ServerBuilder::new(8000, "127.0.0.1".to_string(), metrics.clone())
let state = AppState::new(connection_config.clone(), metrics.clone());
let schema = ServerBuilder::new(state)
.context_data(db)
.context_data(pg_conn_pool)
.context_data(service_config)
Expand Down Expand Up @@ -633,7 +651,8 @@ pub mod tests {
let reader = PgManager::reader(db_url).expect("Failed to create pg connection pool");
let db = Db::new(reader.clone(), service_config.limits, metrics.clone());
let pg_conn_pool = PgManager::new(reader);
let schema = ServerBuilder::new(8000, "127.0.0.1".to_string(), metrics.clone())
let state = AppState::new(connection_config.clone(), metrics.clone());
let schema = ServerBuilder::new(state)
.context_data(db)
.context_data(pg_conn_pool)
.context_data(service_config)
Expand Down Expand Up @@ -686,7 +705,8 @@ pub mod tests {
let metrics = metrics();
let db = Db::new(reader.clone(), service_config.limits, metrics.clone());
let pg_conn_pool = PgManager::new(reader);
let schema = ServerBuilder::new(8000, "127.0.0.1".to_string(), metrics.clone())
let state = AppState::new(connection_config.clone(), metrics.clone());
let schema = ServerBuilder::new(state)
.context_data(db)
.context_data(pg_conn_pool)
.context_data(service_config)
Expand Down Expand Up @@ -728,7 +748,8 @@ pub mod tests {
let reader = PgManager::reader(db_url).expect("Failed to create pg connection pool");
let db = Db::new(reader.clone(), service_config.limits, metrics.clone());
let pg_conn_pool = PgManager::new(reader);
let schema = ServerBuilder::new(8000, "127.0.0.1".to_string(), metrics.clone())
let state = AppState::new(connection_config.clone(), metrics.clone());
let schema = ServerBuilder::new(state)
.context_data(db)
.context_data(pg_conn_pool)
.context_data(service_config)
Expand Down

0 comments on commit 5a7c5b3

Please sign in to comment.