Skip to content

Commit 391dfda

Browse files
authored
HBASE-28140 AbstractWALProvider may miss the WAL which is under creation in getWALs method (#5455)
Signed-off-by: GeorryHuang <huangzhuoyue@apache.org> Signed-off-by: Xiaolin Ha <haxiaolin@apache.org> Signed-off-by: Wellington Chevreuil <wchevreuil@apache.org>
1 parent 8a9ad07 commit 391dfda

File tree

1 file changed

+37
-0
lines changed

1 file changed

+37
-0
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractWALProvider.java

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@
2424
import java.util.concurrent.ConcurrentHashMap;
2525
import java.util.concurrent.ConcurrentMap;
2626
import java.util.concurrent.atomic.AtomicBoolean;
27+
import java.util.concurrent.locks.Condition;
2728
import java.util.concurrent.locks.Lock;
29+
import java.util.concurrent.locks.ReentrantLock;
2830
import java.util.function.BiPredicate;
2931
import java.util.regex.Matcher;
3032
import java.util.regex.Pattern;
@@ -87,6 +89,15 @@ public abstract class AbstractWALProvider implements WALProvider, PeerActionList
8789

8890
private final KeyLocker<String> createLock = new KeyLocker<>();
8991

92+
// in getWALs we can not throw any exceptions out, so we use lock and condition here as it
93+
// supports awaitUninterruptibly which will not throw an InterruptedException
94+
private final Lock numRemoteWALUnderCreationLock = new ReentrantLock();
95+
private final Condition noRemoteWALUnderCreationCond =
96+
numRemoteWALUnderCreationLock.newCondition();
97+
// record the number of remote WALs which are under creation. This is very important to not
98+
// miss a WAL instance in the getWALs method. See HBASE-28140 and related issues for more details.
99+
private int numRemoteWALUnderCreation;
100+
90101
// we need to have this because when getting meta wal, there is no peer info provider yet.
91102
private SyncReplicationPeerInfoProvider peerInfoProvider = new SyncReplicationPeerInfoProvider() {
92103

@@ -150,11 +161,26 @@ private WAL getRemoteWAL(RegionInfo region, String peerId, String remoteWALDir)
150161
WAL wal = createRemoteWAL(region, ReplicationUtils.getRemoteWALFileSystem(conf, remoteWALDir),
151162
ReplicationUtils.getPeerRemoteWALDir(remoteWALDir, peerId), getRemoteWALPrefix(peerId),
152163
ReplicationUtils.SYNC_WAL_SUFFIX);
164+
numRemoteWALUnderCreationLock.lock();
165+
try {
166+
numRemoteWALUnderCreation++;
167+
} finally {
168+
numRemoteWALUnderCreationLock.unlock();
169+
}
153170
initWAL(wal);
154171
peerId2WAL.put(peerId, Optional.of(wal));
155172
return wal;
156173
} finally {
157174
lock.unlock();
175+
numRemoteWALUnderCreationLock.lock();
176+
try {
177+
numRemoteWALUnderCreation--;
178+
if (numRemoteWALUnderCreation == 0) {
179+
noRemoteWALUnderCreationCond.signalAll();
180+
}
181+
} finally {
182+
numRemoteWALUnderCreationLock.unlock();
183+
}
158184
}
159185
}
160186

@@ -179,6 +205,17 @@ public final WAL getWAL(RegionInfo region) throws IOException {
179205

180206
@Override
181207
public final List<WAL> getWALs() {
208+
List<WAL> wals = new ArrayList<WAL>();
209+
numRemoteWALUnderCreationLock.lock();
210+
try {
211+
while (numRemoteWALUnderCreation > 0) {
212+
noRemoteWALUnderCreationCond.awaitUninterruptibly();
213+
}
214+
peerId2WAL.values().stream().filter(Optional::isPresent).map(Optional::get)
215+
.forEach(wals::add);
216+
} finally {
217+
numRemoteWALUnderCreationLock.unlock();
218+
}
182219
return Streams
183220
.concat(peerId2WAL.values().stream().filter(Optional::isPresent).map(Optional::get),
184221
getWALs0().stream())

0 commit comments

Comments
 (0)