Skip to content

Commit 1921e94

Browse files
committed
HADOOP-16458. LocatedFileStatusFetcher.getFileStatuses failing intermittently with S3
Contributed by Steve Loughran. Includes -S3A glob scans don't bother trying to resolve symlinks -stack traces don't get lost in getFileStatuses() when exceptions are wrapped -debug level logging of what is up in Globber -Contains HADOOP-13373. Add S3A implementation of FSMainOperationsBaseTest. -ITestRestrictedReadAccess tests incomplete read access to files. This adds a builder API for constructing globbers which other stores can use so that they too can skip symlink resolution when not needed. Change-Id: I23bcdb2783d6bd77cf168fdc165b1b4b334d91c7
1 parent 918b470 commit 1921e94

File tree

13 files changed

+1089
-50
lines changed

13 files changed

+1089
-50
lines changed

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2064,7 +2064,12 @@ public FileStatus[] listStatus(Path[] files, PathFilter filter)
20642064
* @throws IOException IO failure
20652065
*/
20662066
public FileStatus[] globStatus(Path pathPattern) throws IOException {
2067-
return new Globber(this, pathPattern, DEFAULT_FILTER).glob();
2067+
return Globber.createGlobber(this)
2068+
.withPathPattern(pathPattern)
2069+
.withPathFiltern(DEFAULT_FILTER)
2070+
.withResolveSymlinks(true)
2071+
.build()
2072+
.glob();
20682073
}
20692074

20702075
/**

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java

Lines changed: 186 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,24 @@
2525

2626
import org.apache.hadoop.classification.InterfaceAudience;
2727
import org.apache.hadoop.classification.InterfaceStability;
28+
import org.apache.hadoop.util.DurationInfo;
2829

2930
import org.apache.htrace.core.TraceScope;
3031
import org.apache.htrace.core.Tracer;
3132
import org.slf4j.Logger;
3233
import org.slf4j.LoggerFactory;
3334

35+
import static com.google.common.base.Preconditions.checkNotNull;
36+
37+
/**
38+
* Implementation of {@link FileSystem#globStatus(Path, PathFilter)}.
39+
* This has historically been package-private; it has been opened
40+
* up for object stores within the {@code hadoop-*} codebase ONLY.
41+
* It could be expanded for external store implementations in future.
42+
*/
3443
@InterfaceAudience.Private
3544
@InterfaceStability.Unstable
36-
class Globber {
45+
public class Globber {
3746
public static final Logger LOG =
3847
LoggerFactory.getLogger(Globber.class.getName());
3948

@@ -42,21 +51,62 @@ class Globber {
4251
private final Path pathPattern;
4352
private final PathFilter filter;
4453
private final Tracer tracer;
45-
46-
public Globber(FileSystem fs, Path pathPattern, PathFilter filter) {
54+
private final boolean resolveSymlinks;
55+
56+
Globber(FileSystem fs, Path pathPattern, PathFilter filter) {
57+
this.fs = fs;
58+
this.fc = null;
59+
this.pathPattern = pathPattern;
60+
this.filter = filter;
61+
this.tracer = FsTracer.get(fs.getConf());
62+
this.resolveSymlinks = true;
63+
}
64+
65+
Globber(FileContext fc, Path pathPattern, PathFilter filter) {
66+
this.fs = null;
67+
this.fc = fc;
68+
this.pathPattern = pathPattern;
69+
this.filter = filter;
70+
this.tracer = fc.getTracer();
71+
this.resolveSymlinks = true;
72+
}
73+
74+
/**
75+
* Filesystem constructor for use by {@link GlobBuilder}.
76+
* @param fs filesystem
77+
* @param pathPattern path pattern
78+
* @param filter optional filter
79+
* @param resolveSymlinks should symlinks be resolved.
80+
*/
81+
private Globber(FileSystem fs, Path pathPattern, PathFilter filter,
82+
boolean resolveSymlinks) {
4783
this.fs = fs;
4884
this.fc = null;
4985
this.pathPattern = pathPattern;
5086
this.filter = filter;
87+
this.resolveSymlinks = resolveSymlinks;
5188
this.tracer = FsTracer.get(fs.getConf());
89+
LOG.debug("Created Globber for path={}, symlinks={}",
90+
pathPattern, resolveSymlinks);
5291
}
5392

54-
public Globber(FileContext fc, Path pathPattern, PathFilter filter) {
93+
/**
94+
* File Context constructor for use by {@link GlobBuilder}.
95+
* @param fc file context
96+
* @param pathPattern path pattern
97+
* @param filter optional filter
98+
* @param resolveSymlinks should symlinks be resolved.
99+
*/
100+
private Globber(FileContext fc, Path pathPattern, PathFilter filter,
101+
boolean resolveSymlinks) {
55102
this.fs = null;
56103
this.fc = fc;
57104
this.pathPattern = pathPattern;
58105
this.filter = filter;
106+
this.resolveSymlinks = resolveSymlinks;
59107
this.tracer = fc.getTracer();
108+
LOG.debug("Created Globber path={}, symlinks={}",
109+
pathPattern, resolveSymlinks);
60110
}
61111

62112
private FileStatus getFileStatus(Path path) throws IOException {
@@ -67,6 +117,7 @@ private FileStatus getFileStatus(Path path) throws IOException {
67117
return fc.getFileStatus(path);
68118
}
69119
} catch (FileNotFoundException e) {
120+
LOG.debug("getFileStatus({}) failed; returning null", path, e);
70121
return null;
71122
}
72123
}
@@ -79,6 +130,7 @@ private FileStatus[] listStatus(Path path) throws IOException {
79130
return fc.util().listStatus(path);
80131
}
81132
} catch (FileNotFoundException e) {
133+
LOG.debug("listStatus({}) failed; returning empty array", path, e);
82134
return new FileStatus[0];
83135
}
84136
}
@@ -107,7 +159,7 @@ private static String unescapePathComponent(String name) {
107159
*/
108160
private static List<String> getPathComponents(String path)
109161
throws IOException {
110-
ArrayList<String> ret = new ArrayList<String>();
162+
ArrayList<String> ret = new ArrayList<>();
111163
for (String component : path.split(Path.SEPARATOR)) {
112164
if (!component.isEmpty()) {
113165
ret.add(component);
@@ -145,7 +197,8 @@ private String authorityFromPath(Path path) throws IOException {
145197
public FileStatus[] glob() throws IOException {
146198
TraceScope scope = tracer.newScope("Globber#glob");
147199
scope.addKVAnnotation("pattern", pathPattern.toUri().getPath());
148-
try {
200+
try (DurationInfo ignored = new DurationInfo(LOG, false,
201+
"glob %s", pathPattern)) {
149202
return doGlob();
150203
} finally {
151204
scope.close();
@@ -164,24 +217,26 @@ private FileStatus[] doGlob() throws IOException {
164217
String pathPatternString = pathPattern.toUri().getPath();
165218
List<String> flattenedPatterns = GlobExpander.expand(pathPatternString);
166219

220+
LOG.debug("Filesystem glob {}", pathPatternString);
167221
// Now loop over all flattened patterns. In every case, we'll be trying to
168222
// match them to entries in the filesystem.
169223
ArrayList<FileStatus> results =
170-
new ArrayList<FileStatus>(flattenedPatterns.size());
224+
new ArrayList<>(flattenedPatterns.size());
171225
boolean sawWildcard = false;
172226
for (String flatPattern : flattenedPatterns) {
173227
// Get the absolute path for this flattened pattern. We couldn't do
174228
// this prior to flattening because of patterns like {/,a}, where which
175229
// path you go down influences how the path must be made absolute.
176230
Path absPattern = fixRelativePart(new Path(
177231
flatPattern.isEmpty() ? Path.CUR_DIR : flatPattern));
232+
LOG.debug("Pattern: {}", absPattern);
178233
// Now we break the flattened, absolute pattern into path components.
179234
// For example, /a/*/c would be broken into the list [a, *, c]
180235
List<String> components =
181236
getPathComponents(absPattern.toUri().getPath());
182237
// Starting out at the root of the filesystem, we try to match
183238
// filesystem entries against pattern components.
184-
ArrayList<FileStatus> candidates = new ArrayList<FileStatus>(1);
239+
ArrayList<FileStatus> candidates = new ArrayList<>(1);
185240
// To get the "real" FileStatus of root, we'd have to do an expensive
186241
// RPC to the NameNode. So we create a placeholder FileStatus which has
187242
// the correct path, but defaults for the rest of the information.
@@ -206,12 +261,13 @@ private FileStatus[] doGlob() throws IOException {
206261
for (int componentIdx = 0; componentIdx < components.size();
207262
componentIdx++) {
208263
ArrayList<FileStatus> newCandidates =
209-
new ArrayList<FileStatus>(candidates.size());
264+
new ArrayList<>(candidates.size());
210265
GlobFilter globFilter = new GlobFilter(components.get(componentIdx));
211266
String component = unescapePathComponent(components.get(componentIdx));
212267
if (globFilter.hasPattern()) {
213268
sawWildcard = true;
214269
}
270+
LOG.debug("Component {}, patterned={}", component, sawWildcard);
215271
if (candidates.isEmpty() && sawWildcard) {
216272
// Optimization: if there are no more candidates left, stop examining
217273
// the path components. We can only do this if we've already seen
@@ -245,19 +301,31 @@ private FileStatus[] doGlob() throws IOException {
245301
// incorrectly conclude that /a/b was a file and should not match
246302
// /a/*/*. So we use getFileStatus of the path we just listed to
247303
// disambiguate.
248-
Path path = candidate.getPath();
249-
FileStatus status = getFileStatus(path);
250-
if (status == null) {
251-
// null means the file was not found
252-
LOG.warn("File/directory {} not found:"
253-
+ " it may have been deleted."
254-
+ " If this is an object store, this can be a sign of"
255-
+ " eventual consistency problems.",
256-
path);
257-
continue;
258-
}
259-
if (!status.isDirectory()) {
260-
continue;
304+
if (resolveSymlinks) {
305+
LOG.debug("listStatus found one entry; disambiguating {}",
306+
children[0]);
307+
Path path = candidate.getPath();
308+
FileStatus status = getFileStatus(path);
309+
if (status == null) {
310+
// null means the file was not found
311+
LOG.warn("File/directory {} not found:"
312+
+ " it may have been deleted."
313+
+ " If this is an object store, this can be a sign of"
314+
+ " eventual consistency problems.",
315+
path);
316+
continue;
317+
}
318+
if (!status.isDirectory()) {
319+
LOG.debug("Resolved entry is a file; skipping: {}", status);
320+
continue;
321+
}
322+
} else {
323+
// there's no symlinks in this store, so no need to issue
324+
// another call, just see if the result is a directory or a file
325+
if (children[0].getPath().equals(candidate.getPath())) {
326+
// the listing status is of a file
327+
continue;
328+
}
261329
}
262330
}
263331
for (FileStatus child : children) {
@@ -312,6 +380,8 @@ private FileStatus[] doGlob() throws IOException {
312380
*/
313381
if ((!sawWildcard) && results.isEmpty() &&
314382
(flattenedPatterns.size() <= 1)) {
383+
LOG.debug("No matches found and there was no wildcard in the path {}",
384+
pathPattern);
315385
return null;
316386
}
317387
/*
@@ -324,4 +394,98 @@ private FileStatus[] doGlob() throws IOException {
324394
Arrays.sort(ret);
325395
return ret;
326396
}
397+
398+
/**
399+
* Create a builder for a Globber, bonded to the specific filesystem.
400+
* @param filesystem filesystem
401+
* @return the builder to finish configuring.
402+
*/
403+
public static GlobBuilder createGlobber(FileSystem filesystem) {
404+
return new GlobBuilder(filesystem);
405+
}
406+
407+
/**
408+
* Create a builder for a Globber, bonded to the specific file
409+
* context.
410+
* @param fileContext file context.
411+
* @return the builder to finish configuring.
412+
*/
413+
public static GlobBuilder createGlobber(FileContext fileContext) {
414+
return new GlobBuilder(fileContext);
415+
}
416+
417+
/**
418+
* Builder for Globber instances.
419+
*/
420+
@InterfaceAudience.Private
421+
public static class GlobBuilder {
422+
423+
private final FileSystem fs;
424+
425+
private final FileContext fc;
426+
427+
private Path pathPattern;
428+
429+
private PathFilter filter;
430+
431+
private boolean resolveSymlinks = true;
432+
433+
/**
434+
* Construct bonded to a file context.
435+
* @param fc file context.
436+
*/
437+
public GlobBuilder(final FileContext fc) {
438+
this.fs = null;
439+
this.fc = checkNotNull(fc);
440+
}
441+
442+
/**
443+
* Construct bonded to a filesystem.
444+
* @param fs file system.
445+
*/
446+
public GlobBuilder(final FileSystem fs) {
447+
this.fs = checkNotNull(fs);
448+
this.fc = null;
449+
}
450+
451+
/**
452+
* Set the path pattern.
453+
* @param pattern pattern to use.
454+
* @return the builder
455+
*/
456+
public GlobBuilder withPathPattern(Path pattern) {
457+
pathPattern = pattern;
458+
return this;
459+
}
460+
461+
/**
462+
* Set the path filter.
463+
* @param pathFilter filter
464+
* @return the builder
465+
*/
466+
public GlobBuilder withPathFiltern(PathFilter pathFilter) {
467+
filter = pathFilter;
468+
return this;
469+
}
470+
471+
/**
472+
* Set the symlink resolution policy.
473+
* @param resolve resolution flag.
474+
* @return the builder
475+
*/
476+
public GlobBuilder withResolveSymlinks(boolean resolve) {
477+
resolveSymlinks = resolve;
478+
return this;
479+
}
480+
481+
/**
482+
* Build the Globber.
483+
* @return a new instance.
484+
*/
485+
public Globber build() {
486+
return fs != null
487+
? new Globber(fs, pathPattern, filter, resolveSymlinks)
488+
: new Globber(fc, pathPattern, filter, resolveSymlinks);
489+
}
490+
}
327491
}

hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/LambdaTestUtils.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,9 @@ private static String robustToString(Object o) {
575575
if (o == null) {
576576
return NULL_RESULT;
577577
} else {
578+
if (o instanceof String) {
579+
return '"' + (String)o + '"';
580+
}
578581
try {
579582
return o.toString();
580583
} catch (Exception e) {

hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.apache.hadoop.mapred;
2020

2121
import java.io.IOException;
22+
import java.io.InterruptedIOException;
2223
import java.util.ArrayList;
2324
import java.util.Collections;
2425
import java.util.Comparator;
@@ -250,7 +251,9 @@ protected FileStatus[] listStatus(JobConf job) throws IOException {
250251
job, dirs, recursive, inputFilter, false);
251252
locatedFiles = locatedFileStatusFetcher.getFileStatuses();
252253
} catch (InterruptedException e) {
253-
throw new IOException("Interrupted while getting file statuses");
254+
throw (IOException)
255+
new InterruptedIOException("Interrupted while getting file statuses")
256+
.initCause(e);
254257
}
255258
result = Iterables.toArray(locatedFiles, FileStatus.class);
256259
}

hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/InvalidInputException.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,14 @@ public class InvalidInputException extends IOException {
3838

3939
/**
4040
* Create the exception with the given list.
41+
* The first element of the list is used as the init cause value.
4142
* @param probs the list of problems to report. this list is not copied.
4243
*/
4344
public InvalidInputException(List<IOException> probs) {
4445
problems = probs;
46+
if (!probs.isEmpty()) {
47+
initCause(probs.get(0));
48+
}
4549
}
4650

4751
/**

0 commit comments

Comments
 (0)