[Benchmark] fix writing benchmark output in workflow; adjust latency calculations (MystenLabs#6987)

Avoid truncating the benchmark output: the results now contain more rows than before, and the values we care about are extracted with a regex anyway.

Example failure:
https://github.com/MystenLabs/sui/actions/runs/3753653913/jobs/6377112297#step:9:26

Example new output:

MystenLabs@c14f11c#commitcomment-93874326

Write the per-header and per-certificate latency log lines at debug level outside of benchmark builds as well.

Increase the number of histogram buckets across the latency range we are interested in.
mwtian authored Dec 22, 2022
1 parent bb14ffd commit cebf8ff
Showing 6 changed files with 30 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/bench.yml
@@ -72,7 +72,7 @@ jobs:
set -o pipefail
cargo run --release --package sui-benchmark --bin stress -- --log-path /tmp/stress.log --num-client-threads 10 --num-server-threads 24 --num-transfer-accounts 2 bench --target-qps 100 --num-workers 10 --transfer-object 100 --run-duration 60s 2>&1 | huniq | tee -a artifacts/owned.txt
cargo run --release --package sui-benchmark --bin stress -- --log-path /tmp/stress.log --num-client-threads 10 --num-server-threads 24 --num-transfer-accounts 2 bench --target-qps 100 --num-workers 10 --shared-counter 100 --run-duration 60s 2>&1 | huniq | tee -a artifacts/shared.txt
-pushd narwhal/benchmark && fab local | tail -n 31 | tee -a ../../artifacts/narwhal.txt && popd
+pushd narwhal/benchmark && fab local | tee -a ../../artifacts/narwhal.txt && popd
- name: Retrieve benchmark results
id: get-comment-body
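
For reference, the extraction the commit message alludes to ("we extract the values with a regex anyway") can look roughly like the sketch below. This is a minimal Rust example using the `regex` crate; the `Label: value` line format and the labels in `main` are assumptions for illustration, not the exact lines the workflow parses.

```rust
use regex::Regex;

/// Pull a named numeric metric out of the raw benchmark log.
/// The `label: value` line format here is an assumption for illustration.
fn extract_metric(log: &str, label: &str) -> Option<f64> {
    // e.g. " Consensus TPS: 1,234 tx/s" -> 1234.0
    let re = Regex::new(&format!(r"{label}:\s*([\d,\.]+)")).ok()?;
    let raw = re.captures(log)?.get(1)?.as_str().replace(',', "");
    raw.parse().ok()
}

fn main() {
    let log = " Consensus TPS: 1,234 tx/s\n End-to-end latency: 543 ms\n";
    assert_eq!(extract_metric(log, "Consensus TPS"), Some(1_234.0));
    assert_eq!(extract_metric(log, "End-to-end latency"), Some(543.0));
}
```

Because the values are located by pattern rather than by line position, there is no need to keep only the last 31 lines of the `fab local` output.
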
3 changes: 1 addition & 2 deletions narwhal/consensus/src/consensus.rs
@@ -183,9 +183,8 @@ impl ConsensusState {
.with_label_values(&[])
.set(last_committed_round as i64);

-#[cfg(feature = "benchmark")]
// NOTE: This log entry is used to compute performance.
-tracing::info!(
+tracing::debug!(
"Certificate {:?} took {} seconds to be committed at round {}",
certificate.digest(),
certificate.metadata.created_at.elapsed().as_secs_f64(),
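
Sketched in isolation, the new shape of this log site: the per-certificate latency line is no longer gated behind the `benchmark` feature and is emitted at `debug` level. The `Certificate` struct below is a trimmed stand-in for illustration; the real code logs the certificate digest and the commit round in the same way.

```rust
use std::time::Instant;

// Stand-in for the real narwhal Certificate; only the fields used here.
struct Certificate {
    digest: [u8; 32],
    created_at: Instant,
}

// NOTE: This log entry is used to compute performance.
// Previously gated behind #[cfg(feature = "benchmark")] at info level;
// now compiled unconditionally and emitted at debug level.
fn log_commit_latency(certificate: &Certificate, round: u64) {
    tracing::debug!(
        "Certificate {:?} took {} seconds to be committed at round {}",
        certificate.digest,
        certificate.created_at.elapsed().as_secs_f64(),
        round
    );
}
```
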
3 changes: 1 addition & 2 deletions narwhal/primary/src/core.rs
@@ -483,9 +483,8 @@ impl Core {
.with_label_values(&[&epoch.to_string()])
.observe(header_to_certificate_duration);

-#[cfg(feature = "benchmark")]
// NOTE: This log entry is used to compute performance.
-tracing::info!(
+debug!(
"Header {:?} took {} seconds to be materialized to a certificate {:?}",
certificate.header.digest(),
header_to_certificate_duration,
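
Since these lines now log at debug level, they only show up if the subscriber's filter allows debug output for the relevant targets. A generic `tracing-subscriber` sketch (requires the crate's `env-filter` feature); the target names are assumptions, not Narwhal's actual logging configuration:

```rust
use tracing_subscriber::EnvFilter;

fn main() {
    // Keep everything at info, but surface the per-header / per-certificate
    // latency lines that now log at debug level. The target names below are
    // assumptions; substitute the actual module paths.
    let filter = EnvFilter::new("info,narwhal_primary=debug,narwhal_consensus=debug");
    tracing_subscriber::fmt().with_env_filter(filter).init();

    tracing::debug!(target: "narwhal_primary", "debug-level latency lines are now visible");
}
```
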
5 changes: 3 additions & 2 deletions narwhal/primary/src/metrics.rs
@@ -14,8 +14,9 @@ use std::time::Duration;
use tonic::Code;

const LATENCY_SEC_BUCKETS: &[f64] = &[
-0.05, 0.1, 0.25, 0.5, 1., 2.5, 5., 7.5, 10., 12.5, 15., 20., 25., 30., 60., 90., 120., 180.,
-300.,
+0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.2, 1.4,
+1.6, 1.8, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5, 10.,
+12.5, 15., 17.5, 20., 25., 30., 60., 90., 120., 180., 300.,
];

#[derive(Clone)]
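
Buckets like these typically feed a Prometheus histogram: the denser sub-10s buckets resolve the latencies we actually expect, while the coarse tail still covers pathological cases up to 300s. Below is a sketch of how such a constant is wired into a `HistogramVec` with the `prometheus` crate's registration macro; the metric name and label are illustrative, not the exact ones in `metrics.rs`.

```rust
use prometheus::{register_histogram_vec_with_registry, HistogramVec, Registry};

// The new LATENCY_SEC_BUCKETS from the diff above.
const LATENCY_SEC_BUCKETS: &[f64] = &[
    0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.2, 1.4,
    1.6, 1.8, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5, 10.,
    12.5, 15., 17.5, 20., 25., 30., 60., 90., 120., 180., 300.,
];

fn example_latency_histogram(registry: &Registry) -> HistogramVec {
    // Metric name and label are placeholders for illustration only.
    register_histogram_vec_with_registry!(
        "example_header_to_certificate_latency",
        "Seconds from header creation to certificate formation",
        &["epoch"],
        LATENCY_SEC_BUCKETS.to_vec(),
        registry
    )
    .unwrap()
}
```
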
25 changes: 18 additions & 7 deletions narwhal/primary/src/proposer.rs
@@ -241,9 +241,11 @@ impl Proposer {
self.proposed_headers.insert(this_round, header.clone());

// Update metrics related to latency
+let mut total_inclusion_secs = 0.0;
for (_digest, _worker_id, created_at_timestamp) in digests.clone() {
-let batch_inclusion_duration =
+let batch_inclusion_secs =
Duration::from_millis(header.created_at - created_at_timestamp).as_secs_f64();
+total_inclusion_secs += batch_inclusion_secs;

#[cfg(feature = "benchmark")]
{
@@ -252,21 +254,30 @@
"Batch {:?} from worker {} took {} seconds from creation to be included in a proposed header",
_digest,
_worker_id,
-batch_inclusion_duration
+batch_inclusion_secs
);
}

self.metrics
.proposer_batch_latency
-.observe(batch_inclusion_duration);
+.observe(batch_inclusion_secs);
}

-#[cfg(feature = "benchmark")]
// NOTE: This log entry is used to compute performance.
-tracing::info!(
-"Header {:?} was created in {} seconds",
+let (header_creation_secs, avg_inclusion_secs) = if let Some(digest) = digests.first() {
+(
+Duration::from_millis(header.created_at - digest.2).as_secs_f64(),
+total_inclusion_secs / digests.len() as f64,
+)
+} else {
+(self.max_header_delay.as_secs_f64(), 0.0)
+};
+debug!(
+"Header {:?} was created in {} seconds. Contains {} batches, with average delay {} seconds.",
header.digest(),
-Duration::from_millis(header.created_at - digests.first().unwrap().2).as_secs_f64()
+header_creation_secs,
+digests.len(),
+avg_inclusion_secs,
);

Ok(header)
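
The new latency bookkeeping, restated as a standalone function for readability (field and type names are simplified; timestamps are unix-epoch milliseconds as in the diff, and the header is assumed to be created no earlier than its batches):

```rust
use std::time::Duration;

/// Simplified restatement of the proposer's new latency math.
/// Returns (header creation time, average batch inclusion delay) in seconds.
fn header_latency_stats(
    header_created_at: u64,
    batch_created_at: &[u64],
    max_header_delay: Duration,
) -> (f64, f64) {
    // Sum of per-batch "creation to inclusion in a proposed header" times.
    let total_inclusion_secs: f64 = batch_created_at
        .iter()
        .map(|ts| Duration::from_millis(header_created_at - *ts).as_secs_f64())
        .sum();

    if let Some(first) = batch_created_at.first() {
        // Header creation time measured from its first batch, plus the
        // average inclusion delay reported in the new debug! line.
        (
            Duration::from_millis(header_created_at - *first).as_secs_f64(),
            total_inclusion_secs / batch_created_at.len() as f64,
        )
    } else {
        // An empty header is only proposed after the max header delay fires.
        (max_header_delay.as_secs_f64(), 0.0)
    }
}
```
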
18 changes: 6 additions & 12 deletions narwhal/worker/src/batch_maker.rs
@@ -56,8 +56,8 @@ pub struct BatchMaker {
tx_message: Sender<(Batch, Option<tokio::sync::oneshot::Sender<()>>)>,
/// Metrics handler
node_metrics: Arc<WorkerMetrics>,
-/// The timestamp of the first transaction received
-/// to be included on the next batch
+/// The timestamp of the batch creation.
+/// Average resident time in the batch would be ~ (batch seal time - creation time) / 2
batch_start_timestamp: Instant,
/// The batch store to store our own batches.
store: Store<BatchDigest, Batch>,
@@ -119,15 +119,6 @@ impl BatchMaker {
// 'in-flight' are below a certain number (MAX_PARALLEL_BATCH). This
// condition will be met eventually if the store and network are functioning.
Some((transaction, response_sender)) = self.rx_batch_maker.recv(), if batch_pipeline.len() < MAX_PARALLEL_BATCH => {

-if current_batch.transactions.is_empty() {
-// We are interested to measure the time to seal a batch
-// only when we do have transactions to include. Thus we reset
-// the timer on the first transaction we receive to include on
-// an empty batch.
-self.batch_start_timestamp = Instant::now();
-}

current_batch_size += transaction.len();
current_batch.transactions.push(transaction);
current_responses.push(response_sender);
@@ -137,10 +128,12 @@
}
self.node_metrics.parallel_worker_batches.set(batch_pipeline.len() as i64);

-timer.as_mut().reset(Instant::now() + self.max_batch_delay);
current_batch = Batch::default();
current_responses = Vec::new();
current_batch_size = 0;

+timer.as_mut().reset(Instant::now() + self.max_batch_delay);
+self.batch_start_timestamp = Instant::now();
}
},

@@ -157,6 +150,7 @@
current_batch_size = 0;
}
timer.as_mut().reset(Instant::now() + self.max_batch_delay);
+self.batch_start_timestamp = Instant::now();
}

// TODO: duplicated code in quorum_waiter.rs
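
The net effect of the `batch_maker.rs` change, sketched with a stand-in struct: `batch_start_timestamp` now marks the start of the batch window and is reset on both seal paths (size threshold and max-delay timer) rather than on the first transaction of an empty batch, so the average residency of a transaction in a batch is roughly half the measured window.

```rust
use std::time::{Duration, Instant};

/// Sketch of the adjusted bookkeeping; the real BatchMaker also manages the
/// batch contents, responses, and the max-delay timer shown in the diff.
struct BatchTiming {
    batch_start_timestamp: Instant,
}

impl BatchTiming {
    /// Called on either seal path. Returns how long the batch stayed open;
    /// average transaction residency is roughly half of this value.
    fn on_seal(&mut self) -> Duration {
        let open_for = self.batch_start_timestamp.elapsed();
        self.batch_start_timestamp = Instant::now();
        open_for
    }
}
```
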
