Skip to content

Commit

Permalink
[fix](merge-on-write) when full clone failed, duplicate key might occ…
Browse files Browse the repository at this point in the history
…ur (apache#37001)

## Proposed changes

Issue Number: close #xxx

introduced by apache#31268

full clone failure may produce duplicate keys in mow table
the bug would be triggered in the following condition:
1. replica 0 miss version
2. replica 0 try to do full clone from other replicas
3. the full clone failed and the delete bitmap is overrided incorrectly
4. replica 0 try to do incremental clone again and this time the clone
succeed
5. incremental clone can't fix the delete bitmap overrided by previous
failed full clone
6. duplicate key occurred

solution:
for full clone, don't override the delete bitmap, use `merge()` method
instead.
  • Loading branch information
zhannngchen committed Jul 3, 2024
1 parent 45fc1c7 commit 9a946c4
Show file tree
Hide file tree
Showing 4 changed files with 191 additions and 0 deletions.
9 changes: 9 additions & 0 deletions be/src/olap/snapshot_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,15 @@ Status SnapshotManager::_create_snapshot_files(const TabletSharedPtr& ref_tablet
}
}

DBUG_EXECUTE_IF("SnapshotManager.create_snapshot_files.allow_inc_clone", {
auto tablet_id = dp->param("tablet_id", 0);
auto is_full_clone = dp->param("is_full_clone", false);
if (ref_tablet->tablet_id() == tablet_id && is_full_clone) {
LOG(INFO) << "injected full clone for tabelt: " << tablet_id;
res = Status::InternalError("fault injection error");
}
});

// be would definitely set it as true no matter has missed version or not, we could
// just check whether the missed version is empty or not
int64_t version = -1;
Expand Down
8 changes: 8 additions & 0 deletions be/src/olap/tablet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,14 @@ Status Tablet::revise_tablet_meta(const std::vector<RowsetSharedPtr>& to_add,
break; // while (keys_type() == UNIQUE_KEYS && enable_unique_key_merge_on_write())
}

DBUG_EXECUTE_IF("Tablet.revise_tablet_meta_fail", {
auto ptablet_id = dp->param("tablet_id", 0);
if (tablet_id() == ptablet_id) {
LOG(INFO) << "injected revies_tablet_meta failure for tabelt: " << ptablet_id;
calc_bm_status = Status::InternalError("fault injection error");
}
});

// error handling
if (!calc_bm_status.ok()) {
if (is_incremental_clone) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
1 10
2 200
3 30
4 400
5 500
6 600
7 7
8 8
9 9
10 10

-- !sql --
1 10
2 200
3 30
4 400
5 500
6 600
7 7
8 8
9 9
10 10

-- !sql --
1 10
2 200
3 30
4 400
5 500
6 600
7 7
8 8
9 9
10 10

Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import org.apache.doris.regression.suite.ClusterOptions
import org.apache.doris.regression.util.NodeType

// This case can reproduce an clone issue which may produce duplicate keys in mow table
// the bug would be triggered in the following condition:
// 1. replica 0 miss version
// 2. replica 0 try to do full clone from other replicas
// 3. the full clone failed and the delete bitmap is overrided incorrectly
// 4. replica 0 try to do incremental clone again and this time the clone succeed
// 5. incremental clone can't fix the delete bitmap overrided by previous failed full clone
// 6. duplicate key occurred
//
// the bug is fixed in #37001

suite('test_full_clone_exception') {
def options = new ClusterOptions()
options.feConfigs += [
'disable_tablet_scheduler=true',
'tablet_checker_interval_ms=500',
'schedule_batch_size=1000',
'schedule_slot_num_per_hdd_path=1000',
]
options.beConfigs += [
'disable_auto_compaction=true',
'report_tablet_interval_seconds=1',
]

options.enableDebugPoints()
options.cloudMode = false
docker(options) {
def txnFailureInject = 'TxnManager.prepare_txn.random_failed'
def fullCloneInject = 'SnapshotManager.create_snapshot_files.allow_inc_clone'
def reviseTabletMetaInject='Tablet.revise_tablet_meta_fail'
def be1 = sql_return_maparray('show backends').get(0)
def be2 = sql_return_maparray('show backends').get(1)
def be3 = sql_return_maparray('show backends').get(2)

def addInjectToBE = { beIdx, injectName, param ->
def be = sql_return_maparray('show backends').get(beIdx)
GetDebugPoint().enableDebugPoint(be.Host, be.HttpPort as int, NodeType.BE, injectName, param)
}

def deleteInjectOnBE = { beIdx, injectName ->
def be = sql_return_maparray('show backends').get(beIdx)
GetDebugPoint().disableDebugPoint(be.Host, be.HttpPort as int, NodeType.BE, injectName)
}

sql """
CREATE TABLE IF NOT EXISTS t (
k int,
v int
)
UNIQUE KEY(k)
DISTRIBUTED BY HASH(k) BUCKETS 1 properties(
"enable_unique_key_merge_on_write" = "true"
);
"""

sql 'INSERT INTO t VALUES(1,1),(2,2),(3,3),(4,4)'
sql 'INSERT INTO t VALUES(1,10),(2,20),(3,30),(4,40)'

// inject txn failure, make replica 0 miss version
addInjectToBE(0, txnFailureInject, [percent:1.0])

sql 'INSERT INTO t VALUES(2,200),(4,400),(5,500),(6,600)'
sql 'INSERT INTO t VALUES(7,7)'
sql 'INSERT INTO t VALUES(8,8)'
sql 'INSERT INTO t VALUES(9,9)'

deleteInjectOnBE(0, txnFailureInject)

sql 'INSERT INTO t VALUES(10,10)'

sleep 5000

// check replica 0 miss version
def replica1 = sql_return_maparray('show tablets from t').find { it.BackendId.toLong().equals(be1.BackendId.toLong()) }
assertNotNull(replica1)
assertEquals(3, replica1.VersionCount.toInteger())
assertEquals(3, replica1.Version.toInteger())
assertEquals(8, replica1.LstFailedVersion.toInteger())

def tabletId = replica1.TabletId
// inject failure on replica 0, which can't clone succeed
addInjectToBE(0, reviseTabletMetaInject, [tablet_id:tabletId])
// inject on replica 1, force replica 0 full clone from them
addInjectToBE(1, fullCloneInject, [tablet_id:tabletId, is_full_clone:true])
addInjectToBE(2, fullCloneInject, [tablet_id:tabletId, is_full_clone:true])

// start clone
setFeConfig('disable_tablet_scheduler', false)

sleep 10000

// now, there's lots of full clone failures, remove all debug points, make
// replica 0 can do normal incremental clone from other replicas
deleteInjectOnBE(0, reviseTabletMetaInject)
deleteInjectOnBE(1, fullCloneInject)
deleteInjectOnBE(2, fullCloneInject)

sleep 10000

// make sure the clone succeed
replica1 = sql_return_maparray('show tablets from t').find { it.BackendId.toLong().equals(be1.BackendId.toLong()) }
assertNotNull(replica1)
assertEquals(8, replica1.VersionCount.toInteger())
assertEquals(8, replica1.Version.toInteger())
assertEquals(-1, replica1.LstFailedVersion.toInteger())

// three replica's content should be consistent
sql 'set use_fix_replica=0'
qt_sql 'select * from t order by k'

sql 'set use_fix_replica=1'
qt_sql 'select * from t order by k'

sql 'set use_fix_replica=2'
qt_sql 'select * from t order by k'
}
}

0 comments on commit 9a946c4

Please sign in to comment.