Skip to content

Commit

Permalink
feat: Add initial version prover_autoscaler (#2993)
Browse files Browse the repository at this point in the history
## What ❔

Add zksync_prover_autoscaler, which collects data but, for now, only
reports metrics instead of performing any actual scaling.

<!-- What are the changes this PR brings about? -->
<!-- Example: This PR adds a PR template to the repo. -->
<!-- (For bigger PRs adding more context is appreciated) -->

## Why ❔

First step in creating a fast global prover autoscaler.
<!-- Why are these changes done? What goal do they contribute to? What
are the principles behind them? -->
<!-- Example: PR templates ensure PR reviewers, observers, and future
iterators are in context about the evolution of repos. -->

## Checklist

<!-- Check your PR fulfills the following items. -->
<!-- For draft PRs check the boxes as you complete them. -->

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [x] Tests for the changes have been added / updated.
- [ ] Documentation comments have been added / updated.
- [x] Code has been formatted via `zk fmt` and `zk lint`.
  • Loading branch information
yorik authored Oct 9, 2024
1 parent 25112df commit ebf9604
Show file tree
Hide file tree
Showing 25 changed files with 2,060 additions and 103 deletions.
5 changes: 5 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

163 changes: 82 additions & 81 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,85 +1,85 @@
[workspace]
members = [
# Binaries
"core/bin/block_reverter",
"core/bin/contract-verifier",
"core/bin/external_node",
"core/bin/merkle_tree_consistency_checker",
"core/bin/snapshots_creator",
"core/bin/selector_generator",
"core/bin/system-constants-generator",
"core/bin/verified_sources_fetcher",
"core/bin/zksync_server",
"core/bin/genesis_generator",
"core/bin/zksync_tee_prover",
# Node services
"core/node/node_framework",
"core/node/proof_data_handler",
"core/node/block_reverter",
"core/node/commitment_generator",
"core/node/house_keeper",
"core/node/genesis",
"core/node/shared_metrics",
"core/node/db_pruner",
"core/node/fee_model",
"core/node/da_dispatcher",
"core/node/eth_sender",
"core/node/vm_runner",
"core/node/test_utils",
"core/node/state_keeper",
"core/node/reorg_detector",
"core/node/consistency_checker",
"core/node/metadata_calculator",
"core/node/node_sync",
"core/node/node_storage_init",
"core/node/consensus",
"core/node/contract_verification_server",
"core/node/api_server",
"core/node/tee_verifier_input_producer",
"core/node/base_token_adjuster",
"core/node/external_proof_integration_api",
"core/node/logs_bloom_backfill",
"core/node/da_clients",
# Libraries
"core/lib/db_connection",
"core/lib/zksync_core_leftovers",
"core/lib/basic_types",
"core/lib/config",
"core/lib/constants",
"core/lib/contract_verifier",
"core/lib/contracts",
"core/lib/circuit_breaker",
"core/lib/dal",
"core/lib/env_config",
"core/lib/da_client",
"core/lib/eth_client",
"core/lib/eth_signer",
"core/lib/l1_contract_interface",
"core/lib/mempool",
"core/lib/merkle_tree",
"core/lib/mini_merkle_tree",
"core/lib/node_framework_derive",
"core/lib/object_store",
"core/lib/prover_interface",
"core/lib/queued_job_processor",
"core/lib/state",
"core/lib/storage",
"core/lib/tee_verifier",
"core/lib/types",
"core/lib/protobuf_config",
"core/lib/utils",
"core/lib/vlog",
"core/lib/multivm",
"core/lib/vm_interface",
"core/lib/vm_executor",
"core/lib/web3_decl",
"core/lib/snapshots_applier",
"core/lib/crypto_primitives",
"core/lib/external_price_api",
# Test infrastructure
"core/tests/test_account",
"core/tests/loadnext",
"core/tests/vm-benchmark",
# Binaries
"core/bin/block_reverter",
"core/bin/contract-verifier",
"core/bin/external_node",
"core/bin/merkle_tree_consistency_checker",
"core/bin/snapshots_creator",
"core/bin/selector_generator",
"core/bin/system-constants-generator",
"core/bin/verified_sources_fetcher",
"core/bin/zksync_server",
"core/bin/genesis_generator",
"core/bin/zksync_tee_prover",
# Node services
"core/node/node_framework",
"core/node/proof_data_handler",
"core/node/block_reverter",
"core/node/commitment_generator",
"core/node/house_keeper",
"core/node/genesis",
"core/node/shared_metrics",
"core/node/db_pruner",
"core/node/fee_model",
"core/node/da_dispatcher",
"core/node/eth_sender",
"core/node/vm_runner",
"core/node/test_utils",
"core/node/state_keeper",
"core/node/reorg_detector",
"core/node/consistency_checker",
"core/node/metadata_calculator",
"core/node/node_sync",
"core/node/node_storage_init",
"core/node/consensus",
"core/node/contract_verification_server",
"core/node/api_server",
"core/node/tee_verifier_input_producer",
"core/node/base_token_adjuster",
"core/node/external_proof_integration_api",
"core/node/logs_bloom_backfill",
"core/node/da_clients",
# Libraries
"core/lib/db_connection",
"core/lib/zksync_core_leftovers",
"core/lib/basic_types",
"core/lib/config",
"core/lib/constants",
"core/lib/contract_verifier",
"core/lib/contracts",
"core/lib/circuit_breaker",
"core/lib/dal",
"core/lib/env_config",
"core/lib/da_client",
"core/lib/eth_client",
"core/lib/eth_signer",
"core/lib/l1_contract_interface",
"core/lib/mempool",
"core/lib/merkle_tree",
"core/lib/mini_merkle_tree",
"core/lib/node_framework_derive",
"core/lib/object_store",
"core/lib/prover_interface",
"core/lib/queued_job_processor",
"core/lib/state",
"core/lib/storage",
"core/lib/tee_verifier",
"core/lib/types",
"core/lib/protobuf_config",
"core/lib/utils",
"core/lib/vlog",
"core/lib/multivm",
"core/lib/vm_interface",
"core/lib/vm_executor",
"core/lib/web3_decl",
"core/lib/snapshots_applier",
"core/lib/crypto_primitives",
"core/lib/external_price_api",
# Test infrastructure
"core/tests/test_account",
"core/tests/loadnext",
"core/tests/vm-benchmark",
]
resolver = "2"

Expand Down Expand Up @@ -172,6 +172,7 @@ sqlx = "0.8.1"
static_assertions = "1.1"
structopt = "0.3.20"
strum = "0.26"
strum_macros = "0.26.4"
tempfile = "3.0.2"
test-casing = "0.1.2"
test-log = "0.2.15"
Expand All @@ -185,7 +186,7 @@ tower-http = "0.5.2"
tracing = "0.1"
tracing-subscriber = "0.3"
tracing-opentelemetry = "0.25.0"
time = "0.3.36" # Has to be same as used by `tracing-subscriber`
time = "0.3.36" # Has to be same as used by `tracing-subscriber`
url = "2"
web3 = "0.19.0"
fraction = "0.15.3"
Expand Down
4 changes: 4 additions & 0 deletions core/lib/config/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ anyhow.workspace = true
rand.workspace = true
secrecy.workspace = true
serde = { workspace = true, features = ["derive"] }
time = { workspace = true, features = ["serde-human-readable"] }
strum.workspace = true
strum_macros.workspace = true
vise.workspace = true

[dev-dependencies]
serde_json.workspace = true
Expand Down
1 change: 1 addition & 0 deletions core/lib/config/src/configs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ pub mod house_keeper;
pub mod object_store;
pub mod observability;
pub mod proof_data_handler;
pub mod prover_autoscaler;
pub mod prover_job_monitor;
pub mod pruning;
pub mod secrets;
Expand Down
117 changes: 117 additions & 0 deletions core/lib/config/src/configs/prover_autoscaler.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
use std::collections::HashMap;

use serde::Deserialize;
use strum::Display;
use strum_macros::EnumString;
use time::Duration;
use vise::EncodeLabelValue;

use crate::configs::ObservabilityConfig;

/// Top-level configuration for running ProverAutoscaler, covering both of its
/// modes (Scaler and Agent).
#[derive(Debug, Clone, PartialEq)]
pub struct ProverAutoscalerConfig {
    /// Amount of time to wait for all tasks to finish on graceful shutdown.
    // TODO: find a way to use #[serde(with = "humantime_serde")] with time::Duration.
    pub graceful_shutdown_timeout: Duration,
    /// Settings for the Agent mode, if provided.
    pub agent_config: Option<ProverAutoscalerAgentConfig>,
    /// Settings for the Scaler mode, if provided.
    pub scaler_config: Option<ProverAutoscalerScalerConfig>,
    /// Observability settings, if provided.
    pub observability: Option<ObservabilityConfig>,
}

/// Settings for a ProverAutoscaler Agent.
#[derive(Debug, Clone, PartialEq, Deserialize)]
pub struct ProverAutoscalerAgentConfig {
    /// Port used for the prometheus metrics connection.
    pub prometheus_port: u16,
    /// HTTP port the global Scaler connects to in order to reach the Agent
    /// running in a cluster.
    pub http_port: u16,
    /// Namespaces to watch. When the field is missing during deserialization,
    /// [`ProverAutoscalerAgentConfig::default_namespaces`] supplies the value.
    #[serde(default = "ProverAutoscalerAgentConfig::default_namespaces")]
    pub namespaces: Vec<String>,
    /// Name of the watched cluster. It can also be set via a flag.
    pub cluster_name: Option<String>,
}

/// Settings for the global ProverAutoscaler Scaler.
// NOTE(review): the derived `Default` fills every field with its type's own
// default value; it does not go through the `default_*` functions that serde
// applies to missing fields. Confirm this difference is intended.
#[derive(Debug, Clone, PartialEq, Deserialize, Default)]
pub struct ProverAutoscalerScalerConfig {
    /// Port used for the prometheus metrics connection.
    pub prometheus_port: u16,
    /// Interval between two consecutive runs of the global Scaler.
    /// Falls back to `default_scaler_run_interval` when missing during deserialization.
    #[serde(default = "ProverAutoscalerScalerConfig::default_scaler_run_interval")]
    pub scaler_run_interval: Duration,
    /// URL the queue reports are fetched from.
    /// In production should be "http://prover-job-monitor.stage2.svc.cluster.local:3074/queue_report".
    /// Falls back to `default_prover_job_monitor_url` when missing during deserialization.
    #[serde(default = "ProverAutoscalerScalerConfig::default_prover_job_monitor_url")]
    pub prover_job_monitor_url: String,
    /// ProverAutoscaler Agents to get cluster data from.
    pub agents: Vec<String>,
    /// Mapping of namespaces to protocol versions.
    pub protocol_versions: HashMap<String, String>,
    /// Default cluster priorities: which cluster to prefer when there is no
    /// other information.
    pub cluster_priorities: HashMap<String, u32>,
    /// Prover speed per GPU type, used to calculate the desired number of
    /// provers for a given queue size.
    pub prover_speed: HashMap<Gpu, u32>,
    /// Duration after which a pending pod is considered long pending.
    /// Falls back to `default_long_pending_duration` when missing during deserialization.
    #[serde(default = "ProverAutoscalerScalerConfig::default_long_pending_duration")]
    pub long_pending_duration: Duration,
}

/// GPU type, used as the key of the per-GPU prover speed map.
///
/// String parsing (via `EnumString`) ignores ASCII case for every variant
/// except `Unknown`, which carries no `ascii_case_insensitive` attribute.
/// `Unknown` is the `Default` value.
#[derive(
    Default,
    Debug,
    Display,
    Hash,
    PartialEq,
    Eq,
    Clone,
    Copy,
    Ord,
    PartialOrd,
    EnumString,
    EncodeLabelValue,
    Deserialize,
)]
pub enum Gpu {
    /// Fallback variant for an unidentified GPU type.
    #[default]
    Unknown,
    #[strum(ascii_case_insensitive)]
    L4,
    #[strum(ascii_case_insensitive)]
    T4,
    #[strum(ascii_case_insensitive)]
    V100,
    #[strum(ascii_case_insensitive)]
    P100,
    #[strum(ascii_case_insensitive)]
    A100,
}

impl ProverAutoscalerConfig {
    /// Returns the default graceful shutdown timeout: 5 seconds.
    pub fn default_graceful_shutdown_timeout() -> Duration {
        const TIMEOUT_SECS: i64 = 5;
        Duration::seconds(TIMEOUT_SECS)
    }
}

impl ProverAutoscalerAgentConfig {
    /// Returns the default list of watched namespaces: `prover-blue` and `prover-red`.
    pub fn default_namespaces() -> Vec<String> {
        ["prover-blue", "prover-red"]
            .iter()
            .copied()
            .map(String::from)
            .collect()
    }
}

impl ProverAutoscalerScalerConfig {
    /// Returns the default `scaler_run_interval`: 10 seconds.
    pub fn default_scaler_run_interval() -> Duration {
        const RUN_INTERVAL_SECS: i64 = 10;
        Duration::seconds(RUN_INTERVAL_SECS)
    }

    /// Returns the default `prover_job_monitor_url`: the `/queue_report`
    /// endpoint on localhost, port 3074.
    pub fn default_prover_job_monitor_url() -> String {
        String::from("http://localhost:3074/queue_report")
    }

    /// Returns the default `long_pending_duration`: 10 minutes.
    pub fn default_long_pending_duration() -> Duration {
        const LONG_PENDING_MINUTES: i64 = 10;
        Duration::minutes(LONG_PENDING_MINUTES)
    }
}
1 change: 1 addition & 0 deletions core/lib/protobuf_config/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ rand.workspace = true
hex.workspace = true
secrecy.workspace = true
tracing.workspace = true
time.workspace = true

[build-dependencies]
zksync_protobuf_build.workspace = true
1 change: 1 addition & 0 deletions core/lib/protobuf_config/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ mod observability;
mod proof_data_handler;
pub mod proto;
mod prover;
mod prover_autoscaler;
mod prover_job_monitor;
mod pruning;
mod secrets;
Expand Down
Loading

0 comments on commit ebf9604

Please sign in to comment.