Skip to content

Commit e780dc2

Browse files
committed
Add support to take parallel backups
1 parent 72d5a46 commit e780dc2

18 files changed

+540
-216
lines changed
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
@startuml
2+
/'
3+
/**
4+
* Licensed to the Apache Software Foundation (ASF) under one
5+
* or more contributor license agreements. See the NOTICE file
6+
* distributed with this work for additional information
7+
* regarding copyright ownership. The ASF licenses this file
8+
* to you under the Apache License, Version 2.0 (the
9+
* "License"); you may not use this file except in compliance
10+
* with the License. You may obtain a copy of the License at
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
'/
19+
start
20+
: (1.1) Create Backup;
21+
if ( hasActiveSession or inInconsistentState?) then (yes)
22+
: Fail;
23+
stop
24+
else (no)
25+
: (1.2.1) Create Backup Request and execute;
26+
: (1.2.2) Create dirs in destination;
27+
: (1.2.3) Create Backup Client and execute;
28+
: (1.2.4) Create exclusive backup session;
29+
: (1.2.5) Take Backup Table Snapshot;
30+
: (1.2.6) Set State to RUNNING and phase to REQUEST;
31+
if ( full backup?) then (yes)
32+
: (1.2.7.1.1) Read last backup start time or 0L;
33+
: (1.2.7.1.2) Perform LogRoll Procedure;
34+
: (1.2.7.1.3) Record WAL older than LogRoll to system table;
35+
: (1.2.7.1.4) Set Phase to SNAPSHOT;
36+
: (1.2.7.1.5) Take Snapshot of every table;
37+
: (1.2.7.1.6) Export Snapshot to dest dir;
38+
: (1.2.7.1.7) Write start time for next backup to system table;
39+
: (1.2.7.1.8) Add Manifest;
40+
: (1.2.7.1.9) Delete Snapshots;
41+
: (1.2.7.1.10) Cleanup Export snapshot log;
42+
else (no)
43+
: (1.2.7.2.1) Set phase to PREPARE_INCREMENTAL;
44+
if (fail to get log file map?) then (yes)
45+
: Fail;
46+
stop
47+
else (no)
48+
: (1.2.7.2.2) Copy table and region info;
49+
: (1.2.7.2.3) MR to convert WAL into HFiles;
50+
: (1.2.7.2.4) Copy HFiles into dest with DistCP;
51+
: (1.2.7.2.5) Record WAL older than what is copied;
52+
: (1.2.7.2.6) Write start time for next backup to system table;
53+
: (1.2.7.2.7) Add Manifest;
54+
: (1.2.7.2.8) Cleanup DistCp log;
55+
endif
56+
endif
57+
: (1.2.8) Delete System Table Snapshot;
58+
: (1.2.9) Update BackupInfo with Status Complete;
59+
: (1.2.10) Clear exclusive backup session;
60+
endif
61+
stop
62+
63+
start
64+
: Fail;
65+
: (2.1) Set State to FAILED;
66+
if (full backup?) then (yes)
67+
: (2.2.1.1) Delete all snapshots;
68+
: (2.2.1.2) Cleanup export snapshot log;
69+
endif
70+
: (2.3) Restore backup system table from snapshot;
71+
: (2.4) Delete backup system table snapshot;
72+
: (2.5) Cleanup Target Dir;
73+
stop
74+
@enduml
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
@startuml
2+
/'
3+
/**
4+
* Licensed to the Apache Software Foundation (ASF) under one
5+
* or more contributor license agreements. See the NOTICE file
6+
* distributed with this work for additional information
7+
* regarding copyright ownership. The ASF licenses this file
8+
* to you under the Apache License, Version 2.0 (the
9+
* "License"); you may not use this file except in compliance
10+
* with the License. You may obtain a copy of the License at
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
'/
19+
20+
start
21+
: (1.1) Create Backup;
22+
: (1.2.1) Create Backup Request and execute;
23+
: (1.2.2) Create dirs in destination;
24+
: (1.2.3) Create Backup Client and execute;
25+
#Orange: (1.2.4) Create exclusive backup session;
26+
note right: Table exclusive lock
27+
#Red: (1.2.5) Take Backup Table Snapshot;
28+
note right: To be removed
29+
: (1.2.6) Set State to RUNNING and phase to REQUEST;
30+
if ( full backup?) then (yes)
31+
: (1.2.7.1.1) Read last backup start time or 0L;
32+
: (1.2.7.1.2) Perform LogRoll Procedure;
33+
: (1.2.7.1.3) Record WAL older than LogRoll to system table;
34+
: (1.2.7.1.4) Set Phase to SNAPSHOT;
35+
: (1.2.7.1.5) Take Snapshot of every table;
36+
: (1.2.7.1.6) Export Snapshot to dest dir;
37+
: (1.2.7.1.7) Write start time for next backup to system table;
38+
: (1.2.7.1.8) Add Manifest;
39+
: (1.2.7.1.9) Delete Snapshots;
40+
: (1.2.7.1.10) Cleanup Export snapshot log;
41+
else (no)
42+
: (1.2.7.2.1) Set phase to PREPARE_INCREMENTAL;
43+
if (fail to get log file map?) then (yes)
44+
: Fail;
45+
stop
46+
else (no)
47+
: (1.2.7.2.2) Copy table and region info;
48+
: (1.2.7.2.3) MR to convert WAL into HFiles;
49+
: (1.2.7.2.4) Copy HFiles into dest with DistCP;
50+
: (1.2.7.2.5) Record WAL older than what is copied;
51+
: (1.2.7.2.6) Write start time for next backup to system table;
52+
: (1.2.7.2.7) Add Manifest;
53+
: (1.2.7.2.8) Cleanup DistCp log;
54+
endif
55+
endif
56+
#RED: (1.2.8) Delete System Table Snapshot;
57+
note right: To be removed
58+
: (1.2.9) Update BackupInfo with Status Complete;
59+
#Orange: (1.2.10) Clear exclusive backup session;
60+
note right: Clear table exclusive lock
61+
stop
62+
63+
start
64+
: Fail;
65+
: (2.1) Set State to FAILED;
66+
if (full backup?) then (yes)
67+
: (2.2.1.1) Delete all snapshots;
68+
: (2.2.1.2) Cleanup export snapshot log;
69+
endif
70+
#RED: (2.3) Restore backup system table from snapshot;
71+
note right: To be removed
72+
#RED: (2.4) Delete backup system table snapshot;
73+
note right: To be removed
74+
: (2.5) Cleanup Target Dir;
75+
stop
76+
@enduml

hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/BackupRestoreConstants.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,13 @@ public interface BackupRestoreConstants {
114114

115115
String CONF_STAGING_ROOT = "snapshot.export.staging.root";
116116

117-
String BACKUPID_PREFIX = "backup_";
117+
String BACKUPID_PREFIX = "backup";
118+
119+
String UNDERSCORE = "_";
120+
121+
static String getBackupPrefix() {
122+
return BackupRestoreConstants.BACKUPID_PREFIX + BackupRestoreConstants.UNDERSCORE;
123+
}
118124

119125
enum BackupCommand {
120126
CREATE,

hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupAdminImpl.java

Lines changed: 10 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import java.io.IOException;
2121
import java.util.ArrayList;
22+
import java.util.Arrays;
2223
import java.util.Collections;
2324
import java.util.HashMap;
2425
import java.util.HashSet;
@@ -94,15 +95,18 @@ public int deleteBackups(String[] backupIds) throws IOException {
9495

9596
int totalDeleted = 0;
9697
Map<String, HashSet<TableName>> allTablesMap = new HashMap<>();
98+
List<BackupInfo> backupInfos = new ArrayList<>();
99+
for (String backupId : backupIds) {
100+
backupInfos.add(getBackupInfo(backupId));
101+
}
97102

98103
boolean deleteSessionStarted;
99-
boolean snapshotDone;
100104
try (final BackupSystemTable sysTable = new BackupSystemTable(conn)) {
101105
// Step 1: Make sure there is no active session
102106
// is running by using startBackupSession API
103107
// If there is an active session in progress, exception will be thrown
104108
try {
105-
sysTable.startBackupExclusiveOperation();
109+
sysTable.startBackupExclusiveOperation(backupInfos);
106110
deleteSessionStarted = true;
107111
} catch (IOException e) {
108112
LOG.warn("You can not run delete command while active backup session is in progress. \n"
@@ -121,13 +125,6 @@ public int deleteBackups(String[] backupIds) throws IOException {
121125

122126
// Step 3: Record delete session
123127
sysTable.startDeleteOperation(backupIds);
124-
// Step 4: Snapshot backup system table
125-
if (!BackupSystemTable.snapshotExists(conn)) {
126-
BackupSystemTable.snapshot(conn);
127-
} else {
128-
LOG.warn("Backup system table snapshot exists");
129-
}
130-
snapshotDone = true;
131128
try {
132129
for (int i = 0; i < backupIds.length; i++) {
133130
BackupInfo info = sysTable.readBackupInfo(backupIds[i]);
@@ -145,28 +142,11 @@ public int deleteBackups(String[] backupIds) throws IOException {
145142
finalizeDelete(allTablesMap, sysTable);
146143
// Finish
147144
sysTable.finishDeleteOperation();
148-
// delete snapshot
149-
BackupSystemTable.deleteSnapshot(conn);
150145
} catch (IOException e) {
151-
// Fail delete operation
152-
// Step 1
153-
if (snapshotDone) {
154-
if (BackupSystemTable.snapshotExists(conn)) {
155-
BackupSystemTable.restoreFromSnapshot(conn);
156-
// delete snapshot
157-
BackupSystemTable.deleteSnapshot(conn);
158-
// We still have record with unfinished delete operation
159-
LOG.error("Delete operation failed, please run backup repair utility to restore "
160-
+ "backup system integrity", e);
161-
throw e;
162-
} else {
163-
LOG.warn("Delete operation succeeded, there were some errors: ", e);
164-
}
165-
}
166-
146+
LOG.warn("Delete operation succeeded, there were some errors: ", e);
167147
} finally {
168148
if (deleteSessionStarted) {
169-
sysTable.finishBackupExclusiveOperation();
149+
sysTable.finishBackupExclusiveOperation(Arrays.asList(backupIds));
170150
}
171151
}
172152
}
@@ -524,7 +504,8 @@ public String backupTables(BackupRequest request) throws IOException {
524504
String targetRootDir = request.getTargetRootDir();
525505
List<TableName> tableList = request.getTableList();
526506

527-
String backupId = BackupRestoreConstants.BACKUPID_PREFIX + EnvironmentEdgeManager.currentTime();
507+
String backupId =
508+
BackupRestoreConstants.getBackupPrefix() + EnvironmentEdgeManager.currentTime();
528509
if (type == BackupType.INCREMENTAL) {
529510
Set<TableName> incrTableSet;
530511
try (BackupSystemTable table = new BackupSystemTable(conn)) {

hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupCommands.java

Lines changed: 29 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242

4343
import java.io.IOException;
4444
import java.net.URI;
45+
import java.util.Arrays;
4546
import java.util.List;
4647
import org.apache.commons.lang3.StringUtils;
4748
import org.apache.hadoop.conf.Configuration;
@@ -258,7 +259,7 @@ public static class CreateCommand extends Command {
258259

259260
@Override
260261
protected boolean requiresNoActiveSession() {
261-
return true;
262+
return false;
262263
}
263264

264265
@Override
@@ -335,12 +336,12 @@ public void execute() throws IOException {
335336
System.setProperty("mapreduce.job.queuename", queueName);
336337
}
337338

339+
List<TableName> tablesList = Lists.newArrayList(BackupUtils.parseTableNames(tables));
340+
338341
try (BackupAdminImpl admin = new BackupAdminImpl(conn)) {
339342
BackupRequest.Builder builder = new BackupRequest.Builder();
340343
BackupRequest request = builder.withBackupType(BackupType.valueOf(args[1].toUpperCase()))
341-
.withTableList(
342-
tables != null ? Lists.newArrayList(BackupUtils.parseTableNames(tables)) : null)
343-
.withTargetRootDir(targetBackupDir).withTotalTasks(workers)
344+
.withTableList(tablesList).withTargetRootDir(targetBackupDir).withTotalTasks(workers)
344345
.withBandwidthPerTasks(bandwidth).withBackupSetName(setName).build();
345346
String backupId = admin.backupTables(request);
346347
System.out.println("Backup session " + backupId + " finished. Status: SUCCESS");
@@ -672,35 +673,35 @@ public void execute() throws IOException {
672673
try (final Connection conn = ConnectionFactory.createConnection(conf);
673674
final BackupSystemTable sysTable = new BackupSystemTable(conn)) {
674675
// Failed backup
675-
BackupInfo backupInfo;
676-
List<BackupInfo> list = sysTable.getBackupInfos(BackupState.RUNNING);
677-
if (list.size() == 0) {
676+
List<BackupInfo> backupInfos = sysTable.getBackupInfos(BackupState.RUNNING);
677+
if (backupInfos.size() == 0) {
678678
// No failed sessions found
679679
System.out.println("REPAIR status: no failed sessions found."
680680
+ " Checking failed delete backup operation ...");
681681
repairFailedBackupDeletionIfAny(conn, sysTable);
682682
repairFailedBackupMergeIfAny(conn, sysTable);
683683
return;
684684
}
685-
backupInfo = list.get(0);
686-
// If this is a cancel exception, then we've already cleaned.
687-
// set the failure timestamp of the overall backup
688-
backupInfo.setCompleteTs(EnvironmentEdgeManager.currentTime());
689-
// set failure message
690-
backupInfo.setFailedMsg("REPAIR status: repaired after failure:\n" + backupInfo);
691-
// set overall backup status: failed
692-
backupInfo.setState(BackupState.FAILED);
693-
// compose the backup failed data
694-
String backupFailedData = "BackupId=" + backupInfo.getBackupId() + ",startts="
695-
+ backupInfo.getStartTs() + ",failedts=" + backupInfo.getCompleteTs() + ",failedphase="
696-
+ backupInfo.getPhase() + ",failedmessage=" + backupInfo.getFailedMsg();
697-
System.out.println(backupFailedData);
698-
TableBackupClient.cleanupAndRestoreBackupSystem(conn, backupInfo, conf);
699-
// If backup session is updated to FAILED state - means we
700-
// processed recovery already.
701-
sysTable.updateBackupInfo(backupInfo);
702-
sysTable.finishBackupExclusiveOperation();
703-
System.out.println("REPAIR status: finished repair failed session:\n " + backupInfo);
685+
for (BackupInfo backupInfo : backupInfos) {
686+
// If this is a cancel exception, then we've already cleaned.
687+
// set the failure timestamp of the overall backup
688+
backupInfo.setCompleteTs(EnvironmentEdgeManager.currentTime());
689+
// set failure message
690+
backupInfo.setFailedMsg("REPAIR status: repaired after failure:\n" + backupInfo);
691+
// set overall backup status: failed
692+
backupInfo.setState(BackupState.FAILED);
693+
// compose the backup failed data
694+
String backupFailedData = "BackupId=" + backupInfo.getBackupId() + ",startts="
695+
+ backupInfo.getStartTs() + ",failedts=" + backupInfo.getCompleteTs() + ",failedphase="
696+
+ backupInfo.getPhase() + ",failedmessage=" + backupInfo.getFailedMsg();
697+
System.out.println(backupFailedData);
698+
TableBackupClient.cleanupAndRestoreBackupSystem(conn, backupInfo, conf);
699+
// If backup session is updated to FAILED state - means we
700+
// processed recovery already.
701+
sysTable.updateBackupInfo(backupInfo);
702+
sysTable.finishBackupExclusiveOperation(Arrays.asList(backupInfo.getBackupId()));
703+
System.out.println("REPAIR status: finished repair failed session:\n " + backupInfo);
704+
}
704705
}
705706
}
706707

@@ -709,16 +710,12 @@ private void repairFailedBackupDeletionIfAny(Connection conn, BackupSystemTable
709710
String[] backupIds = sysTable.getListOfBackupIdsFromDeleteOperation();
710711
if (backupIds == null || backupIds.length == 0) {
711712
System.out.println("No failed backup DELETE operation found");
712-
// Delete backup table snapshot if exists
713-
BackupSystemTable.deleteSnapshot(conn);
714713
return;
715714
}
716715
System.out.println("Found failed DELETE operation for: " + StringUtils.join(backupIds));
717716
System.out.println("Running DELETE again ...");
718-
// Restore table from snapshot
719-
BackupSystemTable.restoreFromSnapshot(conn);
720717
// Finish previous failed session
721-
sysTable.finishBackupExclusiveOperation();
718+
sysTable.finishBackupExclusiveOperation(Arrays.asList(backupIds));
722719
try (BackupAdmin admin = new BackupAdminImpl(conn)) {
723720
admin.deleteBackups(backupIds);
724721
}
@@ -731,8 +728,6 @@ public static void repairFailedBackupMergeIfAny(Connection conn, BackupSystemTab
731728
String[] backupIds = sysTable.getListOfBackupIdsFromMergeOperation();
732729
if (backupIds == null || backupIds.length == 0) {
733730
System.out.println("No failed backup MERGE operation found");
734-
// Delete backup table snapshot if exists
735-
BackupSystemTable.deleteSnapshot(conn);
736731
return;
737732
}
738733
System.out.println("Found failed MERGE operation for: " + StringUtils.join(backupIds));
@@ -758,10 +753,8 @@ public static void repairFailedBackupMergeIfAny(Connection conn, BackupSystemTab
758753
} else {
759754
checkRemoveBackupImages(fs, backupRoot, backupIds);
760755
}
761-
// Restore table from snapshot
762-
BackupSystemTable.restoreFromSnapshot(conn);
763756
// Unlock backup system
764-
sysTable.finishBackupExclusiveOperation();
757+
sysTable.finishBackupExclusiveOperation(Arrays.asList(backupIds));
765758
// Finish previous failed session
766759
sysTable.finishMergeOperation();
767760

0 commit comments

Comments
 (0)