Skip to content

Commit af49823

Browse files
committed
HADOOP-12657. Add an option to skip newline on empty files with getMerge -nl. Contributed by Kanaka Kumar Avvaru.
(cherry picked from commit 061c05c)
1 parent e06c291 commit af49823

File tree

5 files changed

+55
-20
lines changed

5 files changed

+55
-20
lines changed

hadoop-common-project/hadoop-common/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ Release 2.8.0 - UNRELEASED
5555

5656
HADOOP-12366. expose calculated paths (aw)
5757

58+
HADOOP-12657. Add an option to skip newline on empty files with getMerge -nl.
59+
(Kanaka Kumar Avvaru via aajisaka)
60+
5861
IMPROVEMENTS
5962

6063
HADOOP-12458. Retries is typoed to spell Retires in parts of

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CopyCommands.java

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -53,24 +53,29 @@ public static void registerCommands(CommandFactory factory) {
5353
/** merge multiple files together */
5454
public static class Merge extends FsCommand {
5555
public static final String NAME = "getmerge";
56-
public static final String USAGE = "[-nl] <src> <localdst>";
56+
public static final String USAGE = "[-nl] [-skip-empty-file] "
57+
+ "<src> <localdst>";
5758
public static final String DESCRIPTION =
58-
"Get all the files in the directories that " +
59-
"match the source file pattern and merge and sort them to only " +
60-
"one file on local fs. <src> is kept.\n" +
61-
"-nl: Add a newline character at the end of each file.";
59+
"Get all the files in the directories that "
60+
+ "match the source file pattern and merge and sort them to only "
61+
+ "one file on local fs. <src> is kept.\n"
62+
+ "-nl: Add a newline character at the end of each file.\n"
63+
+ "-skip-empty-file: Do not add new line character for empty file.";
6264

6365
protected PathData dst = null;
6466
protected String delimiter = null;
67+
private boolean skipEmptyFileDelimiter;
6568
protected List<PathData> srcs = null;
6669

6770
@Override
6871
protected void processOptions(LinkedList<String> args) throws IOException {
6972
try {
70-
CommandFormat cf = new CommandFormat(2, Integer.MAX_VALUE, "nl");
73+
CommandFormat cf = new CommandFormat(2, Integer.MAX_VALUE, "nl",
74+
"skip-empty-file");
7175
cf.parse(args);
7276

7377
delimiter = cf.getOpt("nl") ? "\n" : null;
78+
skipEmptyFileDelimiter = cf.getOpt("skip-empty-file");
7479

7580
dst = new PathData(new URI(args.removeLast()), getConf());
7681
if (dst.exists && dst.stat.isDirectory()) {
@@ -92,21 +97,26 @@ protected void processArguments(LinkedList<PathData> items)
9297
FSDataOutputStream out = dst.fs.create(dst.path);
9398
try {
9499
for (PathData src : srcs) {
95-
FSDataInputStream in = src.fs.open(src.path);
96-
try {
97-
IOUtils.copyBytes(in, out, getConf(), false);
98-
if (delimiter != null) {
99-
out.write(delimiter.getBytes("UTF-8"));
100+
if (src.stat.getLen() != 0) {
101+
try (FSDataInputStream in = src.fs.open(src.path)) {
102+
IOUtils.copyBytes(in, out, getConf(), false);
103+
writeDelimiter(out);
100104
}
101-
} finally {
102-
in.close();
105+
} else if (!skipEmptyFileDelimiter) {
106+
writeDelimiter(out);
103107
}
104108
}
105109
} finally {
106110
out.close();
107-
}
111+
}
108112
}
109-
113+
114+
private void writeDelimiter(FSDataOutputStream out) throws IOException {
115+
if (delimiter != null) {
116+
out.write(delimiter.getBytes("UTF-8"));
117+
}
118+
}
119+
110120
@Override
111121
protected void processNonexistentPath(PathData item) throws IOException {
112122
exitCode = 1; // flag that a path is bad

hadoop-common-project/hadoop-common/src/site/markdown/FileSystemShell.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,7 @@ getmerge
375375
Usage: `hadoop fs -getmerge [-nl] [-skip-empty-file] <src> <localdst>`
376376

377377
Takes a source directory and a destination file as input and concatenates files in src into the destination local file. Optionally -nl can be set to enable adding a newline character (LF) at the end of each file.
378+
-skip-empty-file can be used to avoid unwanted newline characters in case of empty files.
378379

379380
Examples:
380381

hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestFsShellCopy.java

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -318,14 +318,16 @@ public void testCopyMerge() throws Exception {
318318
Path f1 = new Path(root, "f1");
319319
Path f2 = new Path(root, "f2");
320320
Path f3 = new Path(root, "f3");
321+
Path empty = new Path(root, "empty");
321322
Path fnf = new Path(root, "fnf");
322323
Path d = new Path(root, "dir");
323324
Path df1 = new Path(d, "df1");
324325
Path df2 = new Path(d, "df2");
325326
Path df3 = new Path(d, "df3");
326327

327328
createFile(f1, f2, f3, df1, df2, df3);
328-
329+
createEmptyFile(empty);
330+
329331
int exit;
330332
// one file, kind of silly
331333
exit = shell.run(new String[]{
@@ -366,6 +368,13 @@ public void testCopyMerge() throws Exception {
366368
assertEquals(0, exit);
367369
assertEquals("f1\nf2\n", readFile("out"));
368370

371+
exit = shell.run(new String[]{
372+
"-getmerge", "-nl", "-skip-empty-file",
373+
f1.toString(), f2.toString(), empty.toString(),
374+
"out" });
375+
assertEquals(0, exit);
376+
assertEquals("f1\nf2\n", readFile("out"));
377+
369378
// glob three files
370379
shell.run(new String[]{
371380
"-getmerge", "-nl",
@@ -374,13 +383,13 @@ public void testCopyMerge() throws Exception {
374383
assertEquals(0, exit);
375384
assertEquals("f1\nf2\nf3\n", readFile("out"));
376385

377-
// directory with 3 files, should skip subdir
386+
// directory with 1 empty + 3 non empty files, should skip subdir
378387
shell.run(new String[]{
379388
"-getmerge", "-nl",
380389
root.toString(),
381390
"out" });
382391
assertEquals(0, exit);
383-
assertEquals("f1\nf2\nf3\n", readFile("out"));
392+
assertEquals("\nf1\nf2\nf3\n", readFile("out"));
384393

385394
// subdir
386395
shell.run(new String[]{
@@ -538,7 +547,14 @@ private void createFile(Path ... paths) throws IOException {
538547
out.close();
539548
}
540549
}
541-
550+
551+
private void createEmptyFile(Path ... paths) throws IOException {
552+
for (Path path : paths) {
553+
FSDataOutputStream out = lfs.create(path);
554+
out.close();
555+
}
556+
}
557+
542558
private String readFile(String out) throws IOException {
543559
Path path = new Path(out);
544560
FileStatus stat = lfs.getFileStatus(path);

hadoop-common-project/hadoop-common/src/test/resources/testConf.xml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -601,7 +601,7 @@
601601
<comparators>
602602
<comparator>
603603
<type>RegexpComparator</type>
604-
<expected-output>^-getmerge \[-nl\] &lt;src&gt; &lt;localdst&gt; :\s*</expected-output>
604+
<expected-output>^-getmerge \[-nl\] \[-skip-empty-file\] &lt;src&gt; &lt;localdst&gt; :\s*</expected-output>
605605
</comparator>
606606
<comparator>
607607
<type>RegexpComparator</type>
@@ -615,6 +615,11 @@
615615
<type>RegexpComparator</type>
616616
<expected-output>^( |\t)*-nl\s+Add a newline character at the end of each file.( )*</expected-output>
617617
</comparator>
618+
<comparator>
619+
<type>RegexpComparator</type>
620+
<expected-output>^( |\t)*-skip-empty-file\s+Do not add new line character for empty file.( )*</expected-output>
621+
</comparator>
622+
618623
</comparators>
619624
</test>
620625

0 commit comments

Comments
 (0)