onflow · gomisha · Oct 12, 2023 · Sep 12, 2023 · Sep 20, 2023 · Sep 20, 2023
diff --git a/engine/common/synchronization/config.go b/engine/common/synchronization/config.go
@@ -46,6 +46,12 @@ const spamProbabilityMultiplier = 1001
 // message types.
 type SpamDetectionConfig struct {
 
+	// batchRequestBaseProb is the base probability that's used in creating the final probability of creating a
+	// misbehavior report for a BatchRequest message. This is why the word "base" is used in the name of this field,
+	// since it's not the final probability and there are other factors that determine the final probability.
+	// The reason for this is that we want to increase the probability of creating a misbehavior report for a large batch.
+	batchRequestBaseProb float32
+
 	// syncRequestProb is the probability of creating a misbehavior report for a SyncRequest message.
 	syncRequestProb float32
 

diff --git a/engine/common/synchronization/engine.go b/engine/common/synchronization/engine.go
@@ -208,7 +208,7 @@ func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, eve
 func (e *Engine) process(channel channels.Channel, originID flow.Identifier, event interface{}) error {
 	switch message := event.(type) {
 	case *messages.BatchRequest:
-		report, valid, err := e.validateBatchRequestForALSP(channel, originID, message)
+		report, valid, err := e.validateBatchRequestForALSP(originID, message)
 		if err != nil {
 			return fmt.Errorf("failed to validate batch request from %x: %w", originID[:], err)
 		}
@@ -509,8 +509,72 @@ func (e *Engine) sendRequests(participants flow.IdentifierList, ranges []chainsy
 	}
 }
 
-// TODO: implement spam reporting similar to validateSyncRequestForALSP
-func (e *Engine) validateBatchRequestForALSP(channel channels.Channel, id flow.Identifier, batchRequest *messages.BatchRequest) (*alsp.MisbehaviorReport, bool, error) {
+// validateBatchRequestForALSP checks if a batch request should be reported as a misbehavior due to malicious intent (e.g. spamming).
+// It returns a misbehavior report and a boolean indicating whether validation passed, as well as an error.
+// Returns an error that is assumed to be irrecoverable because of internal processes that didn't allow validation to complete.
+// Returns true if the batch request is valid and should not be reported as misbehavior.
+// Returns false if either
+// a) the batch request is invalid or
+// b) the batch request is valid but should be reported as misbehavior anyway (due to probabilities) or
+// c) an error is encountered.
+func (e *Engine) validateBatchRequestForALSP(originID flow.Identifier, batchRequest *messages.BatchRequest) (*alsp.MisbehaviorReport, bool, error) {
+	// Generate a random integer between 0 (inclusive) and spamProbabilityMultiplier (exclusive)
+	n, err := rand.Uint32n(spamProbabilityMultiplier)
+
+	if err != nil {
+		return nil, false, fmt.Errorf("failed to generate random number from %x: %w", originID[:], err)
+	}
+
+	// validity check: if no block IDs, always report as misbehavior
+	if len(batchRequest.BlockIDs) == 0 {
+		e.log.Warn().
+			Hex("origin_id", logging.ID(originID)).
+			Str(logging.KeySuspicious, "true").
+			Str("reason", alsp.InvalidMessage.String()).
+			Msg("received invalid batch request with 0 block IDs, creating ALSP report")
+		report, err := alsp.NewMisbehaviorReport(originID, alsp.InvalidMessage)
+
+		if err != nil {
+			// failing to create the misbehavior report is unlikely. If an error is encountered while
+			// creating the misbehavior report it indicates a bug and processing can not proceed.
+			return nil, false, fmt.Errorf("failed to create misbehavior report (invalid batch request, no block IDs) from %x: %w", originID[:], err)
+		}
+		// failed validation check and should be reported as misbehavior
+		return report, false, nil
+	}
+
+	// to avoid creating a misbehavior report for every batch request received, use a probabilistic approach.
+	// The larger the batch request and base probability, the higher the probability of creating a misbehavior report.
+
+	// batchRequestProb is calculated as follows:
+	// batchRequestBaseProb * (len(batchRequest.BlockIDs) + 1) / synccore.DefaultConfig().MaxSize
+	// Example 1 (small batch of block IDs) if the batch request is for 10 block IDs and batchRequestBaseProb is 0.01, then the probability of
+	// creating a misbehavior report is:
+	// batchRequestBaseProb * (10+1) / synccore.DefaultConfig().MaxSize
+	// = 0.01 * 11 / 64 = 0.00171875 = 0.171875%
+	// Example 2 (large batch of block IDs) if the batch request is for 1000 block IDs and batchRequestBaseProb is 0.01, then the probability of
+	// creating a misbehavior report is:
+	// batchRequestBaseProb * (1000+1) / synccore.DefaultConfig().MaxSize
+	// = 0.01 * 1001 / 64 = 0.15640625 = 15.640625%
+	batchRequestProb := e.spamDetectionConfig.batchRequestBaseProb * (float32(len(batchRequest.BlockIDs)) + 1) / float32(synccore.DefaultConfig().MaxSize)
+	if float32(n) < batchRequestProb*spamProbabilityMultiplier {
+		// create a misbehavior report
+		e.log.Warn().
+			Hex("origin_id", logging.ID(originID)).
+			Str(logging.KeySuspicious, "true").
+			Str("reason", alsp.ResourceIntensiveRequest.String()).
+			Msgf("for %d block IDs, creating probabilistic ALSP report", len(batchRequest.BlockIDs))
+		report, err := alsp.NewMisbehaviorReport(originID, alsp.ResourceIntensiveRequest)
+
+		if err != nil {
+			// failing to create the misbehavior report is unlikely. If an error is encountered while
+			// creating the misbehavior report it indicates a bug and processing can not proceed.
+			return nil, false, fmt.Errorf("failed to create misbehavior report from %x: %w", originID[:], err)
+		}
+		// failed validation check and should be reported as misbehavior
+		return report, false, nil
+	}
+
 	return nil, true, nil
 }
 
@@ -519,11 +583,14 @@ func (e *Engine) validateBlockResponseForALSP(channel channels.Channel, id flow.
 	return nil, true, nil
 }
 
-// validateRangeRequestForALSP checks if a range request should be reported as a misbehavior.
+// validateRangeRequestForALSP checks if a range request should be reported as a misbehavior due to malicious intent (e.g. spamming).
 // It returns a misbehavior report and a boolean indicating whether validation passed, as well as an error.
 // Returns an error that is assumed to be irrecoverable because of internal processes that didn't allow validation to complete.
 // Returns true if the range request is valid and should not be reported as misbehavior.
-// Returns false if either a) the range request is invalid or b) the range request is valid but should be reported as misbehavior anyway (due to probabilities) or c) an error is encountered.
+// Returns false if either
+// a) the range request is invalid or
+// b) the range request is valid but should be reported as misbehavior anyway (due to probabilities) or
+// c) an error is encountered.
 func (e *Engine) validateRangeRequestForALSP(originID flow.Identifier, rangeRequest *messages.RangeRequest) (*alsp.MisbehaviorReport, bool, error) {
 	// Generate a random integer between 1 and spamProbabilityMultiplier (exclusive)
 	n, err := rand.Uint32n(spamProbabilityMultiplier)
@@ -586,11 +653,13 @@ func (e *Engine) validateRangeRequestForALSP(originID flow.Identifier, rangeRequ
 	return nil, true, nil
 }
 
-// validateSyncRequestForALSP checks if a sync request should be reported as a misbehavior.
+// validateSyncRequestForALSP checks if a sync request should be reported as a misbehavior due to malicious intent (e.g. spamming).
 // It returns a misbehavior report and a boolean indicating whether validation passed, as well as an error.
 // Returns an error that is assumed to be irrecoverable because of internal processes that didn't allow validation to complete.
 // Returns true if passed validation.
-// Returns false if either a) failed validation (due to probabilities) or b) an error is encountered.
+// Returns false if either
+// a) failed validation (due to probabilities) or
+// b) an error is encountered.
 func (e *Engine) validateSyncRequestForALSP(originID flow.Identifier) (*alsp.MisbehaviorReport, bool, error) {
 	// Generate a random integer between 1 and spamProbabilityMultiplier (exclusive)
 	n, err := rand.Uint32n(spamProbabilityMultiplier)

diff --git a/engine/common/synchronization/engine_spam_test.go b/engine/common/synchronization/engine_spam_test.go
@@ -6,6 +6,8 @@ import (
 	"testing"
 	"time"
 
+	"github.com/onflow/flow-go/model/flow"
+
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
@@ -268,3 +270,106 @@ func (ss *SyncSuite) TestLoad_Process_RangeRequest_SometimesReportSpam() {
 		misbehaviorsCounter = 0 // reset counter for next subtest
 	}
 }
+
+// TestLoad_Process_BatchRequest_SometimesReportSpam is a load test that ensures that the number of misbehavior reports
+// generated falls within an expected range when the base probability factor and number of block IDs are set to different values.
+func (ss *SyncSuite) TestLoad_Process_BatchRequest_SometimesReportSpam() {
+	ctx, cancel := irrecoverable.NewMockSignalerContextWithCancel(ss.T(), context.Background())
+	ss.e.Start(ctx)
+	unittest.AssertClosesBefore(ss.T(), ss.e.Ready(), time.Second)
+	defer cancel()
+
+	load := 1000
+
+	// each load test is a load group that contains a set of factors with unique values to test how many misbehavior reports are generated.
+	// Due to the probabilistic nature of how misbehavior reports are generated, we use an expected lower and
+	// upper range of expected misbehaviors to determine if the load test passed or failed. As long as the number of misbehavior reports
+	// falls within the expected range, the load test passes.
+	type loadGroup struct {
+		batchRequestBaseProb      float32
+		expectedMisbehaviorsLower int
+		expectedMisbehaviorsUpper int
+		blockIDs                  []flow.Identifier
+	}
+
+	loadGroups := []loadGroup{}
+
+	// using a very small batch request (1 block ID) with a 10% base probability factor, expect to almost never get misbehavior report, about 0.3% of the time (3 in 1000 requests)
+	// expected probability factor: 0.1 * (1 + 1)/64 = 0.003125
+	loadGroups = append(loadGroups, loadGroup{0.1, 0, 15, repeatedBlockIDs(1)})
+
+	// using a small batch request (10 block IDs) with a 10% base probability factor, expect to get misbehavior report about 1.7% of the time (17 in 1000 requests)
+	// expected probability factor: 0.1 * ((11-1) + 1)/64 = 0.0171875
+	loadGroups = append(loadGroups, loadGroup{0.1, 5, 31, repeatedBlockIDs(10)})
+
+	// using a large batch request (99 block IDs) with a 10% base probability factor, expect to get misbehavior report about 15% of the time (150 in 1000 requests)
+	// expected probability factor: 0.1 * ((100-1) + 1)/64 = 0.15625
+	loadGroups = append(loadGroups, loadGroup{0.1, 110, 200, repeatedBlockIDs(99)})
+
+	// using a small batch request (10 block IDs) with a 1% base probability factor, expect to almost never get misbehavior report, about 0.17% of the time (2 in 1000 requests)
+	// expected probability factor: 0.01 * ((11-1) + 1)/64 = 0.00171875
+	loadGroups = append(loadGroups, loadGroup{0.01, 0, 7, repeatedBlockIDs(10)})
+
+	// using a very large batch request (999 block IDs) with a 1% base probability factor, expect to get misbehavior report about 15% of the time (150 in 1000 requests)
+	// expected probability factor: 0.01 * ((1000-1) + 1)/64 = 0.15625
+	loadGroups = append(loadGroups, loadGroup{0.01, 110, 200, repeatedBlockIDs(999)})
+
+	// ALWAYS REPORT SPAM FOR INVALID BATCH REQUESTS OR BATCH REQUESTS THAT ARE FAR OUTSIDE OF THE TOLERANCE
+
+	// using an empty batch request (0 block IDs) always results in a misbehavior report, no matter how small the base probability factor is
+	loadGroups = append(loadGroups, loadGroup{0.001, 1000, 1000, []flow.Identifier{}})
+
+	// using a very large batch request (999 block IDs) with a 10% base probability factor, expect to get misbehavior report 100% of the time (1000 in 1000 requests)
+	// expected probability factor: 0.1 * (999 + 1)/64 = 1.5625
+	loadGroups = append(loadGroups, loadGroup{0.1, 1000, 1000, repeatedBlockIDs(999)})
+
+	// misbehavior report counter shared by all load groups; it is reset to 0 after each load group completes
+	misbehaviorsCounter := 0
+
+	for _, loadGroup := range loadGroups {
+		for i := 0; i < load; i++ {
+			ss.T().Log("load iteration", i)
+
+			nonce, err := rand.Uint64()
+			require.NoError(ss.T(), err, "should generate nonce")
+
+			// generate origin and request message
+			originID := unittest.IdentifierFixture()
+			req := &messages.BatchRequest{
+				Nonce:    nonce,
+				BlockIDs: loadGroup.blockIDs,
+			}
+
+			// count misbehavior reports over the course of a load test
+			ss.con.On("ReportMisbehavior", mock.Anything).Return(mock.Anything).Maybe().Run(
+				func(args mock.Arguments) {
+					misbehaviorsCounter++
+				},
+			)
+			ss.e.spamDetectionConfig.batchRequestBaseProb = loadGroup.batchRequestBaseProb
+			require.NoError(ss.T(), ss.e.Process(channels.SyncCommittee, originID, req))
+		}
+		// check function call expectations at the end of the load test; otherwise, load test would take much longer
+		ss.core.AssertExpectations(ss.T())
+		ss.con.AssertExpectations(ss.T())
+
+		// check that correct range of misbehavior reports were generated
+		// since we're using a probabilistic approach to generate misbehavior reports, we can't guarantee the exact number,
+		// so we check that it's within an expected range
+		ss.T().Logf("misbehaviors counter after load test: %d (expected lower bound: %d expected upper bound: %d)", misbehaviorsCounter, loadGroup.expectedMisbehaviorsLower, loadGroup.expectedMisbehaviorsUpper)
+		assert.GreaterOrEqual(ss.T(), misbehaviorsCounter, loadGroup.expectedMisbehaviorsLower)
+		assert.LessOrEqual(ss.T(), misbehaviorsCounter, loadGroup.expectedMisbehaviorsUpper)
+
+		misbehaviorsCounter = 0 // reset counter for next subtest
+	}
+}
+
+func repeatedBlockIDs(n int) []flow.Identifier {
+	blockID := unittest.BlockFixture().ID()
+
+	arr := make([]flow.Identifier, n)
+	for i := 0; i < n; i++ {
+		arr[i] = blockID
+	}
+	return arr
+}
diff --git a/model/messages/synchronization.go b/model/messages/synchronization.go
@@ -8,6 +8,8 @@ import (
 // SyncRequest is part of the synchronization protocol and represents a node on
 // the network sharing the height of its latest finalized block and requesting
 // the same information from the recipient.
+// All SyncRequest messages are validated before being processed. If validation fails, then a misbehavior report is created.
+// See synchronization.validateSyncRequestForALSP for more details.
 type SyncRequest struct {
 	Nonce  uint64
 	Height uint64
@@ -25,6 +27,8 @@ type SyncResponse struct {
 // (pulling) attempt to synchronize with the consensus state of the network. It
 // requests finalized blocks by a range of block heights, including from and to
 // heights.
+// All RangeRequest messages are validated before being processed. If validation fails, then a misbehavior report is created.
+// See synchronization.validateRangeRequestForALSP for more details.
 type RangeRequest struct {
 	Nonce      uint64
 	FromHeight uint64
@@ -34,6 +38,8 @@ type RangeRequest struct {
 // BatchRequest is part of the synchronization protocol and represents an active
 // (pulling) attempt to synchronize with the consensus state of the network. It
 // requests finalized or unfinalized blocks by a list of block IDs.
+// All BatchRequest messages are validated before being processed. If validation fails, then a misbehavior report is created.
+// See synchronization.validateBatchRequestForALSP for more details.
 type BatchRequest struct {
 	Nonce    uint64
 	BlockIDs []flow.Identifier