Skip to content

Commit bc6bea5

Browse files
amoghRZPDaveCTurner
authored andcommitted
Remove node from cluster when node locks broken (#61400)
In #52680 we introduced a mechanism that will allow nodes to remove themselves from the cluster if they locally determine themselves to be unhealthy. The only check today is that their data paths are all empirically writeable. This commit extends this check to consider a failure of `NodeEnvironment#assertEnvIsLocked()` to be an indication of unhealthiness. Closes #58373
1 parent aa0dc56 commit bc6bea5

File tree

2 files changed

+84
-4
lines changed

2 files changed

+84
-4
lines changed

server/src/main/java/org/elasticsearch/monitor/fs/FsHealthService.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ public class FsHealthService extends AbstractLifecycleComponent implements NodeH
5858
private static final Logger logger = LogManager.getLogger(FsHealthService.class);
5959
private final ThreadPool threadPool;
6060
private volatile boolean enabled;
61+
private volatile boolean brokenLock;
6162
private final TimeValue refreshInterval;
6263
private volatile TimeValue slowPathLoggingThreshold;
6364
private final NodeEnvironment nodeEnv;
@@ -117,6 +118,8 @@ public StatusInfo getHealth() {
117118
Set<Path> unhealthyPaths = this.unhealthyPaths;
118119
if (enabled == false) {
119120
statusInfo = new StatusInfo(HEALTHY, "health check disabled");
121+
} else if (brokenLock) {
122+
statusInfo = new StatusInfo(UNHEALTHY, "health check failed due to broken node lock");
120123
} else if (unhealthyPaths == null) {
121124
statusInfo = new StatusInfo(HEALTHY, "health check passed");
122125
} else {
@@ -150,7 +153,16 @@ public void run() {
150153

151154
private void monitorFSHealth() {
152155
Set<Path> currentUnhealthyPaths = null;
153-
for (Path path : nodeEnv.nodeDataPaths()) {
156+
Path[] paths = null;
157+
try {
158+
paths = nodeEnv.nodeDataPaths();
159+
} catch (IllegalStateException e) {
160+
logger.error("health check failed", e);
161+
brokenLock = true;
162+
return;
163+
}
164+
165+
for (Path path : paths) {
154166
long executionStartTime = currentTimeMillisSupplier.getAsLong();
155167
try {
156168
if (Files.exists(path)) {
@@ -176,6 +188,7 @@ private void monitorFSHealth() {
176188
}
177189
}
178190
unhealthyPaths = currentUnhealthyPaths;
191+
brokenLock = false;
179192
}
180193
}
181194
}

server/src/test/java/org/elasticsearch/monitor/fs/FsHealthServiceTests.java

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
import java.io.OutputStream;
4343
import java.nio.channels.FileChannel;
4444
import java.nio.file.FileSystem;
45-
import java.nio.file.OpenOption;
4645
import java.nio.file.Path;
46+
import java.nio.file.OpenOption;
4747
import java.nio.file.attribute.FileAttribute;
4848
import java.util.Set;
4949
import java.util.concurrent.TimeUnit;
@@ -231,6 +231,36 @@ public void testFailsHealthOnSinglePathWriteFailure() throws IOException {
231231
}
232232
}
233233

234+
public void testFailsHealthOnUnexpectedLockFileSize() throws IOException {
235+
FileSystem fileSystem = PathUtils.getDefaultFileSystem();
236+
final Settings settings = Settings.EMPTY;
237+
TestThreadPool testThreadPool = new TestThreadPool(getClass().getName(), settings);
238+
FileSystemUnexpectedLockFileSizeProvider unexpectedLockFileSizeFileSystemProvider = new FileSystemUnexpectedLockFileSizeProvider(
239+
fileSystem, 1, testThreadPool);
240+
fileSystem = unexpectedLockFileSizeFileSystemProvider.getFileSystem(null);
241+
PathUtilsForTesting.installMock(fileSystem);
242+
final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
243+
try (NodeEnvironment env = newNodeEnvironment()) {
244+
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
245+
fsHealthService.new FsHealthMonitor().run();
246+
assertEquals(HEALTHY, fsHealthService.getHealth().getStatus());
247+
assertEquals("health check passed", fsHealthService.getHealth().getInfo());
248+
249+
// enabling unexpected file size injection
250+
unexpectedLockFileSizeFileSystemProvider.injectUnexpectedFileSize.set(true);
251+
252+
fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
253+
fsHealthService.new FsHealthMonitor().run();
254+
assertEquals(UNHEALTHY, fsHealthService.getHealth().getStatus());
255+
assertThat(fsHealthService.getHealth().getInfo(), is("health check failed due to broken node lock"));
256+
assertEquals(1, unexpectedLockFileSizeFileSystemProvider.getInjectedPathCount());
257+
} finally {
258+
unexpectedLockFileSizeFileSystemProvider.injectUnexpectedFileSize.set(false);
259+
PathUtilsForTesting.teardown();
260+
ThreadPool.terminate(testThreadPool, 500, TimeUnit.MILLISECONDS);
261+
}
262+
}
263+
234264
private static class FileSystemIOExceptionProvider extends FilterFileSystemProvider {
235265

236266
AtomicBoolean injectIOException = new AtomicBoolean();
@@ -254,7 +284,8 @@ public int getInjectedPathCount(){
254284
public OutputStream newOutputStream(Path path, OpenOption... options) throws IOException {
255285
if (injectIOException.get()){
256286
assert pathPrefix != null : "must set pathPrefix before starting disruptions";
257-
if (path.toString().startsWith(pathPrefix) && path.toString().endsWith(".es_temp_file")) {
287+
if (path.toString().startsWith(pathPrefix) && path.toString().
288+
endsWith(FsHealthService.FsHealthMonitor.TEMP_FILE_NAME)) {
258289
injectedPaths.incrementAndGet();
259290
throw new IOException("fake IOException");
260291
}
@@ -289,7 +320,8 @@ public FileChannel newFileChannel(Path path, Set<? extends OpenOption> options,
289320
public void force(boolean metaData) throws IOException {
290321
if (injectIOException.get()) {
291322
assert pathPrefix != null : "must set pathPrefix before starting disruptions";
292-
if (path.toString().startsWith(pathPrefix) && path.toString().endsWith(".es_temp_file")) {
323+
if (path.toString().startsWith(pathPrefix) && path.toString().
324+
endsWith(FsHealthService.FsHealthMonitor.TEMP_FILE_NAME)) {
293325
injectedPaths.incrementAndGet();
294326
throw new IOException("fake IOException");
295327
}
@@ -341,4 +373,39 @@ public void force(boolean metaData) throws IOException {
341373
};
342374
}
343375
}
376+
377+
private static class FileSystemUnexpectedLockFileSizeProvider extends FilterFileSystemProvider {
378+
379+
AtomicBoolean injectUnexpectedFileSize = new AtomicBoolean();
380+
AtomicInteger injectedPaths = new AtomicInteger();
381+
382+
private final long size;
383+
private final ThreadPool threadPool;
384+
385+
FileSystemUnexpectedLockFileSizeProvider(FileSystem inner, long size, ThreadPool threadPool) {
386+
super("disrupt_fs_health://", inner);
387+
this.size = size;
388+
this.threadPool = threadPool;
389+
}
390+
391+
public int getInjectedPathCount(){
392+
return injectedPaths.get();
393+
}
394+
395+
@Override
396+
public FileChannel newFileChannel(Path path, Set<? extends OpenOption> options, FileAttribute<?>... attrs) throws IOException {
397+
return new FilterFileChannel(super.newFileChannel(path, options, attrs)) {
398+
@Override
399+
public long size() throws IOException {
400+
if (injectUnexpectedFileSize.get()) {
401+
if (path.getFileName().toString().equals(NodeEnvironment.NODE_LOCK_FILENAME)) {
402+
injectedPaths.incrementAndGet();
403+
return size;
404+
}
405+
}
406+
return super.size();
407+
}
408+
};
409+
}
410+
}
344411
}

0 commit comments

Comments
 (0)