Commit f8aa3e2

Authored by Ray Mattingly (rmdmattingly) with a co-author
HBASE-28680 BackupLogCleaner causes HMaster WALs to pile up indefinitely (#6006)
We have been trying to set up daily incremental backups for hundreds of clusters at my day job. Recently we discovered that old WALs were piling up across many clusters, in line with when we began running incremental backups. This led to the realization that the BackupLogCleaner will always skip archived HMaster WALs. This is a problem because, if a cleaner skips a given file, the CleanerChore will never delete it. This seems like a misunderstanding of what it means to "skip" a WAL in a BaseLogCleanerDelegate; instead, we should always return these HMaster WALs as deletable from the perspective of the BackupLogCleaner. We could subject them to the same scrutiny as RegionServer WALs (are they older than the most recent successful backup?), but, if I understand correctly, HMaster WALs do not contain any data relevant to table backups, so that would be unnecessary.

Co-authored-by: Ray Mattingly <rmattingly@hubspot.com>
Signed-off-by: Nick Dimiduk <ndimiduk@apache.org>
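To make the failure mode concrete: the master's cleaner chore only deletes an archived WAL when every configured cleaner delegate returns it from getDeletableFiles, so a delegate that silently omits ("skips") a file pins it on disk indefinitely. The sketch below is a simplified, standalone model of that interaction, not HBase code; the names CleanerChoreSketch, CleanerDelegate, delegateThat, filesToDelete, and coveredByNewerBackup are illustrative assumptions, and the example WAL names are borrowed from the new tests.

import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;

// Simplified model of a cleaner chore running pluggable cleaner delegates.
// A file is only removed when EVERY delegate reports it as deletable, so a
// delegate that "skips" (never returns) a file keeps it on disk forever.
public class CleanerChoreSketch {

  // Stand-in for a log cleaner delegate's getDeletableFiles contract.
  interface CleanerDelegate {
    List<String> getDeletableFiles(List<String> candidates);
  }

  // Build a delegate from a simple predicate.
  static CleanerDelegate delegateThat(Predicate<String> deletable) {
    return candidates -> {
      List<String> out = new ArrayList<>();
      for (String f : candidates) {
        if (deletable.test(f)) {
          out.add(f);
        }
      }
      return out;
    };
  }

  // The chore chains delegates: only files that survive every delegate's
  // filter are actually deleted from the archive directory.
  static List<String> filesToDelete(List<String> candidates, List<CleanerDelegate> delegates) {
    List<String> remaining = candidates;
    for (CleanerDelegate d : delegates) {
      remaining = d.getDeletableFiles(remaining);
    }
    return remaining;
  }

  // Hypothetical stand-in for the real check of a RegionServer WAL's
  // timestamp against the most recent successful backup for its host.
  static boolean coveredByNewerBackup(String wal) {
    return true;
  }

  public static void main(String[] args) {
    List<String> oldWals = List.of(
      "hmaster%2C60000%2C1716224062663.1716247552189$masterlocalwal$", // archived HMaster WAL
      "regionserver%2C16020%2C1716224062663.1716247552189");           // RegionServer WAL

    // Buggy behavior: archived HMaster WALs are never returned as deletable,
    // so they are filtered out here and pile up indefinitely.
    CleanerDelegate buggyBackupCleaner =
      delegateThat(f -> !f.endsWith("$masterlocalwal$") && coveredByNewerBackup(f));

    // Fixed behavior: HMaster WALs are always deletable (they hold no data
    // relevant to table backups); RegionServer WALs keep the backup check.
    CleanerDelegate fixedBackupCleaner =
      delegateThat(f -> f.endsWith("$masterlocalwal$") || coveredByNewerBackup(f));

    System.out.println("buggy deletes: " + filesToDelete(oldWals, List.of(buggyBackupCleaner)));
    System.out.println("fixed deletes: " + filesToDelete(oldWals, List.of(fixedBackupCleaner)));
  }
}

Under the buggy delegate the archived HMaster WAL never appears in the deletable set, so the chore can never remove it; the fixed delegate always reports HMaster WALs as deletable while leaving RegionServer WALs subject to the last-backup check.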
1 parent ff19758 · commit f8aa3e2

2 files changed (+57 −20 lines)

hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/master/BackupLogCleaner.java

Lines changed: 38 additions & 20 deletions
@@ -25,6 +25,7 @@
 import java.util.Map;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.HBaseInterfaceAudience;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.backup.BackupInfo;
@@ -36,6 +37,7 @@
 import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.master.MasterServices;
 import org.apache.hadoop.hbase.master.cleaner.BaseLogCleanerDelegate;
+import org.apache.hadoop.hbase.master.region.MasterRegionFactory;
 import org.apache.hadoop.hbase.net.Address;
 import org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore;
 import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
@@ -123,27 +125,8 @@ public Iterable<FileStatus> getDeletableFiles(Iterable<FileStatus> files) {
       return Collections.emptyList();
     }
     for (FileStatus file : files) {
-      String fn = file.getPath().getName();
-      if (fn.startsWith(WALProcedureStore.LOG_PREFIX)) {
+      if (canDeleteFile(addressToLastBackupMap, file.getPath())) {
         filteredFiles.add(file);
-        continue;
-      }
-
-      try {
-        Address walServerAddress =
-          Address.fromString(BackupUtils.parseHostNameFromLogFile(file.getPath()));
-        long walTimestamp = AbstractFSWALProvider.getTimestamp(file.getPath().getName());
-
-        if (
-          !addressToLastBackupMap.containsKey(walServerAddress)
-            || addressToLastBackupMap.get(walServerAddress) >= walTimestamp
-        ) {
-          filteredFiles.add(file);
-        }
-      } catch (Exception ex) {
-        LOG.warn(
-          "Error occurred while filtering file: {} with error: {}. Ignoring cleanup of this log",
-          file.getPath(), ex.getMessage());
       }
     }
@@ -176,4 +159,39 @@ public void stop(String why) {
   public boolean isStopped() {
     return this.stopped;
   }
+
+  protected static boolean canDeleteFile(Map<Address, Long> addressToLastBackupMap, Path path) {
+    if (isHMasterWAL(path)) {
+      return true;
+    }
+
+    try {
+      String hostname = BackupUtils.parseHostNameFromLogFile(path);
+      if (hostname == null) {
+        LOG.warn(
+          "Cannot parse hostname from RegionServer WAL file: {}. Ignoring cleanup of this log",
+          path);
+        return false;
+      }
+      Address walServerAddress = Address.fromString(hostname);
+      long walTimestamp = AbstractFSWALProvider.getTimestamp(path.getName());
+
+      if (
+        !addressToLastBackupMap.containsKey(walServerAddress)
+          || addressToLastBackupMap.get(walServerAddress) >= walTimestamp
+      ) {
+        return true;
+      }
+    } catch (Exception ex) {
+      LOG.warn("Error occurred while filtering file: {}. Ignoring cleanup of this log", path, ex);
+      return false;
+    }
+    return false;
+  }
+
+  private static boolean isHMasterWAL(Path path) {
+    String fn = path.getName();
+    return fn.startsWith(WALProcedureStore.LOG_PREFIX)
+      || fn.endsWith(MasterRegionFactory.ARCHIVED_WAL_SUFFIX);
+  }
 }

hbase-backup/src/test/java/org/apache/hadoop/hbase/backup/master/TestBackupLogCleaner.java

Lines changed: 19 additions & 0 deletions
@@ -20,10 +20,12 @@
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;

+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.backup.BackupType;
@@ -132,4 +134,21 @@ public void testBackupLogCleaner() throws Exception {
       conn.close();
     }
   }
+
+  @Test
+  public void testCleansUpHMasterWal() {
+    Path path = new Path("/hbase/MasterData/WALs/hmaster,60000,1718808578163");
+    assertTrue(BackupLogCleaner.canDeleteFile(Collections.emptyMap(), path));
+  }
+
+  @Test
+  public void testCleansUpArchivedHMasterWal() {
+    Path normalPath =
+      new Path("/hbase/oldWALs/hmaster%2C60000%2C1716224062663.1716247552189$masterlocalwal$");
+    assertTrue(BackupLogCleaner.canDeleteFile(Collections.emptyMap(), normalPath));
+
+    Path masterPath = new Path(
+      "/hbase/MasterData/oldWALs/hmaster%2C60000%2C1716224062663.1716247552189$masterlocalwal$");
+    assertTrue(BackupLogCleaner.canDeleteFile(Collections.emptyMap(), masterPath));
+  }
 }
