Skip to content

HBASE-26482 HMaster may clean wals that is replicating in rare cases #3876

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,11 @@ public long getWALPosition(ServerName serverName, String queueId, String fileNam
return 0;
}

/**
* This implementation must update the cversion of the root {@link #queuesZNode}, because the
* optimistic lock of the {@link #getAllWALs()} method is based on that cversion.
* @see #getAllWALs() for how the cversion of the root {@link #queuesZNode} is used.
*/
@Override
public Pair<String, SortedSet<String>> claimQueue(ServerName sourceServerName, String queueId,
ServerName destServerName) throws ReplicationException {
Expand Down Expand Up @@ -417,6 +422,12 @@ public Pair<String, SortedSet<String>> claimQueue(ServerName sourceServerName, S
}
// add delete op for peer
listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldQueueNode));
// Append the new queue id to the znode name to prevent lock contention in the ZooKeeper server.
String claimLockZNode = ZNodePaths.joinZNode(queuesZNode, "cversion_" + newQueueId);
// A trick to update the cversion of the root queuesZNode.
// The optimistic lock of the getAllWALs() method is based on the cversion of the root queuesZNode.
listOfOps.add(ZKUtilOp.createAndFailSilent(claimLockZNode, HConstants.EMPTY_BYTE_ARRAY));
listOfOps.add(ZKUtilOp.deleteNodeFailSilent(claimLockZNode));

LOG.trace("The multi list size is {}", listOfOps.size());
ZKUtil.multiOrSequential(zookeeper, listOfOps, false);
Expand Down Expand Up @@ -505,6 +516,13 @@ protected int getQueuesZNodeCversion() throws KeeperException {
return stat.getCversion();
}

/**
* The optimistic lock of this implementation is based on the cversion of the root
* {@link #queuesZNode}. Therefore, the cversion of the root {@link #queuesZNode} must be updated
* whenever WAL nodes are migrated to other queues.
* @see #claimQueue(ServerName, String, ServerName) for an example of updating the cversion of
* the root {@link #queuesZNode}.
*/
@Override
public Set<String> getAllWALs() throws ReplicationException {
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,18 +206,29 @@ public void testAddRemoveLog() throws ReplicationException {
}
}

// For HBASE-12865
// For HBASE-12865, HBASE-26482
@Test
public void testClaimQueueChangeCversion() throws ReplicationException, KeeperException {
// Verifies that each claimQueue call changes the value returned by getQueuesZNodeCversion().
// NOTE(review): this span reads like a rendered diff with removed and added lines interleaved
// (v0 is declared twice; an exact "+1" check and a "greater than" check both follow the first
// claim) -- confirm which lines belong to the final version before compiling.
ServerName serverName1 = ServerName.valueOf("127.0.0.1", 8000, 10000);
STORAGE.addWAL(serverName1, "1", "file");
STORAGE.addWAL(serverName1, "2", "file");

int v0 = STORAGE.getQueuesZNodeCversion();
ServerName serverName2 = ServerName.valueOf("127.0.0.1", 8001, 10001);
// Add WALs for serverName2 up front, so that claimQueue does not update the cversion merely by
// preparing serverName2's rsNode.
STORAGE.addWAL(serverName2, "1", "file");
STORAGE.addWAL(serverName2, "2", "file");

// Baseline cversion, read after all queues exist and before the first claim.
int v0 = STORAGE.getQueuesZNodeCversion();

STORAGE.claimQueue(serverName1, "1", serverName2);
int v1 = STORAGE.getQueuesZNodeCversion();
// cversion should increase by 1 since a child node is deleted
assertEquals(1, v1 - v0);
// The cversion must be increased by the claimQueue method.
assertTrue(v1 > v0);

// Claim the second queue and check that the cversion moves again.
STORAGE.claimQueue(serverName1, "2", serverName2);
int v2 = STORAGE.getQueuesZNodeCversion();
// The cversion must be increased by the claimQueue method.
assertTrue(v2 > v1);
}

private ZKReplicationQueueStorage createWithUnstableVersion() throws IOException {
Expand Down