Skip to content
This repository has been archived by the owner on Nov 15, 2023. It is now read-only.

Commit

Permalink
Retry failed PVF execution
Browse files Browse the repository at this point in the history
PVF execution that fails due to AmbiguousWorkerDeath should be retried once.
This should reduce the occurrence of failures due to transient conditions.

Closes #6195
  • Loading branch information
mrcnski committed Nov 3, 2022
1 parent 2e39830 commit 2e91ff3
Showing 1 changed file with 38 additions and 20 deletions.
58 changes: 38 additions & 20 deletions node/core/candidate-validation/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ use polkadot_node_subsystem::{
overseer, FromOrchestra, OverseerSignal, SpawnedSubsystem, SubsystemError, SubsystemResult,
SubsystemSender,
};
use polkadot_node_subsystem_util::TimeoutExt;
use polkadot_parachain::primitives::{ValidationParams, ValidationResult as WasmValidationResult};
use polkadot_primitives::v2::{
CandidateCommitments, CandidateDescriptor, CandidateReceipt, Hash, OccupiedCoreAssumption,
Expand All @@ -60,6 +61,9 @@ mod tests;

const LOG_TARGET: &'static str = "parachain::candidate-validation";

// The amount of time to wait before retrying after an AmbiguousWorkerDeath validation error.
const PVF_EXECUTION_RETRY_DELAY: Duration = Duration::from_secs(1);

/// Configuration for the candidate validation subsystem
#[derive(Clone)]
pub struct Config {
Expand Down Expand Up @@ -621,28 +625,21 @@ impl ValidationBackend for ValidationHost {
timeout: Duration,
params: ValidationParams,
) -> Result<WasmValidationResult, ValidationError> {
let (tx, rx) = oneshot::channel();
if let Err(err) = self
.execute_pvf(
Pvf::from_code(raw_validation_code),
timeout,
params.encode(),
polkadot_node_core_pvf::Priority::Normal,
tx,
)
.await
{
return Err(ValidationError::InternalError(format!(
"cannot send pvf to the validation host: {:?}",
err
)))
}
let pvf = Pvf::from_code(raw_validation_code);

let validation_result = rx
.await
.map_err(|_| ValidationError::InternalError("validation was cancelled".into()))?;
let validation_result = execute_pvf_once(self, pvf.clone(), timeout, params.encode()).await;

validation_result
// If we get an AmbiguousWorkerDeath error, retry once after a brief delay, on the
// assumption that the conditions that caused this error may have been transient.
if let Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)) =
validation_result
{
// Wait a brief delay before retrying.
let _: Option<()> = future::pending().timeout(PVF_EXECUTION_RETRY_DELAY).await;
execute_pvf_once(self, pvf, timeout, params.encode()).await
} else {
validation_result
}
}

async fn precheck_pvf(&mut self, pvf: Pvf) -> Result<(), PrepareError> {
Expand All @@ -657,6 +654,27 @@ impl ValidationBackend for ValidationHost {
}
}

// Tries executing a PVF a single time (no retries).
async fn execute_pvf_once(
host: &mut ValidationHost,
pvf: Pvf,
timeout: Duration,
params: Vec<u8>,
) -> Result<WasmValidationResult, ValidationError> {
let priority = polkadot_node_core_pvf::Priority::Normal;

let (tx, rx) = oneshot::channel();
if let Err(err) = host.execute_pvf(pvf, timeout, params, priority, tx).await {
return Err(ValidationError::InternalError(format!(
"cannot send pvf to the validation host: {:?}",
err
)))
}

rx.await
.map_err(|_| ValidationError::InternalError("validation was cancelled".into()))?
}

/// Does basic checks of a candidate. Provide the encoded PoV-block. Returns `Ok` if basic checks
/// are passed, `Err` otherwise.
fn perform_basic_checks(
Expand Down

0 comments on commit 2e91ff3

Please sign in to comment.