Skip to content

Commit 16fb3e0

Browse files
committed
#46 Count rows written correctly when distinct is enabled, for both CSV and regular sort
1 parent 54bd7b4 commit 16fb3e0

File tree

5 files changed

+78
-25
lines changed

5 files changed

+78
-25
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ import com.google.code.externalsorting.ExternalSort;
2323
//... inputfile: input file name
2424
//... outputfile: output file name
2525
// next command sorts the lines from inputfile to outputfile
26-
ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(new File(inputfile)), new File(outputfile));
26+
long numLinesWritten = ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(new File(inputfile)), new File(outputfile));
2727
// you can also provide a custom string comparator, see API
2828
```
2929

@@ -56,7 +56,7 @@ ArrayList<CSVRecord> header = new ArrayList<CSVRecord>();
5656
// next two lines sort the lines from inputfile to outputfile
5757
List<File> sortInBatch = CsvExternalSort.sortInBatch(file, null, sortOptions, header);
5858
// at this point you can access header if you'd like.
59-
CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);
59+
int numWrittenLines = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);
6060

6161
```
6262

src/main/java/com/google/code/externalsorting/ExternalSort.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -226,15 +226,15 @@ public int compare(IOStringStack i,
226226
pq.add(bfb);
227227
}
228228
}
229-
long rowcounter = 0;
229+
long numLinesWritten = 0;
230230
try {
231231
if (!distinct) {
232232
while (pq.size() > 0) {
233233
IOStringStack bfb = pq.poll();
234234
String r = bfb.pop();
235235
fbw.write(r);
236236
fbw.newLine();
237-
++rowcounter;
237+
++numLinesWritten;
238238
if (bfb.empty()) {
239239
bfb.close();
240240
} else {
@@ -248,7 +248,7 @@ public int compare(IOStringStack i,
248248
lastLine = bfb.pop();
249249
fbw.write(lastLine);
250250
fbw.newLine();
251-
++rowcounter;
251+
++numLinesWritten;
252252
if (bfb.empty()) {
253253
bfb.close();
254254
} else {
@@ -263,8 +263,8 @@ public int compare(IOStringStack i,
263263
fbw.write(r);
264264
fbw.newLine();
265265
lastLine = r;
266+
++numLinesWritten;
266267
}
267-
++rowcounter;
268268
if (bfb.empty()) {
269269
bfb.close();
270270
} else {
@@ -278,7 +278,7 @@ public int compare(IOStringStack i,
278278
bfb.close();
279279
}
280280
}
281-
return rowcounter;
281+
return numLinesWritten;
282282

283283
}
284284

@@ -460,11 +460,11 @@ public static long mergeSortedFiles(List<File> files, BufferedWriter fbw,
460460
BinaryFileBuffer bfb = new BinaryFileBuffer(br);
461461
bfbs.add(bfb);
462462
}
463-
long rowcounter = mergeSortedFiles(fbw, cmp, distinct, bfbs);
463+
long numLinesWritten = mergeSortedFiles(fbw, cmp, distinct, bfbs);
464464
for (File f : files) {
465465
f.delete();
466466
}
467-
return rowcounter;
467+
return numLinesWritten;
468468
}
469469

470470
/**

src/main/java/com/google/code/externalsorting/csv/CsvExternalSort.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ public int compare(CSVRecordBuffer i, CSVRecordBuffer j) {
8181
for (CSVRecordBuffer bfb : bfbs)
8282
if (!bfb.empty())
8383
pq.add(bfb);
84-
int rowcounter = 0;
84+
int numWrittenLines = 0;
8585
CSVPrinter printer = new CSVPrinter(fbw, sortOptions.getFormat());
8686
if(! sortOptions.isSkipHeader()) {
8787
for(CSVRecord r: header) {
@@ -98,8 +98,8 @@ public int compare(CSVRecordBuffer i, CSVRecordBuffer j) {
9898
} else {
9999
printer.printRecord(r);
100100
lastLine = r;
101+
++numWrittenLines;
101102
}
102-
++rowcounter;
103103
if (bfb.empty()) {
104104
bfb.close();
105105
} else {
@@ -113,7 +113,7 @@ public int compare(CSVRecordBuffer i, CSVRecordBuffer j) {
113113
bfb.close();
114114
}
115115

116-
return rowcounter;
116+
return numWrittenLines;
117117
}
118118

119119
public static int mergeSortedFiles(List<File> files, File outputfile, final CsvSortOptions sortOptions,
@@ -131,14 +131,14 @@ public static int mergeSortedFiles(List<File> files, File outputfile, final CsvS
131131
BufferedWriter fbw = new BufferedWriter(
132132
new OutputStreamWriter(new FileOutputStream(outputfile, append), sortOptions.getCharset()));
133133

134-
int rowcounter = mergeSortedFiles(fbw, sortOptions, bfbs, header);
134+
int numWrittenLines = mergeSortedFiles(fbw, sortOptions, bfbs, header);
135135
for (File f : files) {
136136
if (!f.delete()) {
137137
LOG.log(Level.WARNING, String.format("The file %s was not deleted", f.getName()));
138138
}
139139
}
140140

141-
return rowcounter;
141+
return numWrittenLines;
142142
}
143143

144144
public static List<File> sortInBatch(long size_in_byte, final BufferedReader fbr, final File tmpdirectory,

src/test/java/com/google/code/externalsorting/ExternalSortTest.java

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package com.google.code.externalsorting;
22

3+
import static com.google.code.externalsorting.ExternalSort.defaultcomparator;
34
import static org.junit.Assert.assertArrayEquals;
45
import static org.junit.Assert.assertEquals;
56
import static org.junit.Assert.assertNotNull;
@@ -206,15 +207,17 @@ public int compare(String o1, String o2) {
206207
};
207208
File out = File.createTempFile("test_results", ".tmp", null);
208209
out.deleteOnExit();
209-
ExternalSort.mergeSortedFiles(this.fileList, out, cmp,
210-
Charset.defaultCharset(), true);
210+
long numLinesWritten = ExternalSort.mergeSortedFiles(this.fileList, out, cmp,
211+
Charset.defaultCharset(), true);
211212

212213
List<String> result = new ArrayList<>();
213214
try (BufferedReader bf = new BufferedReader(new FileReader(out))) {
214215
while ((line = bf.readLine()) != null) {
215216
result.add(line);
216217
}
217218
}
219+
220+
assertEquals(11, numLinesWritten);
218221
assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_MERGE_DISTINCT_RESULTS,
219222
result.toArray());
220223
}
@@ -399,9 +402,22 @@ public static void writeStringToFile(File f, String s) throws IOException {
399402
public void sortVeryLargeFile() throws IOException {
400403
final Path veryLargeFile = getTestFile();
401404
final Path outputFile = Files.createTempFile("Merged-File", ".tmp");
402-
final long sortedLines = ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(veryLargeFile.toFile()), outputFile.toFile());
405+
final long numLinesWritten = ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(veryLargeFile.toFile()), outputFile.toFile());
403406
final long expectedLines = 2148L * 1000000L;
404-
assertEquals(expectedLines, sortedLines);
407+
assertEquals(expectedLines, numLinesWritten);
408+
}
409+
410+
@Ignore("This test takes too long to execute")
411+
@Test
412+
public void sortVeryLargeFileWhenDistinctEnabled() throws IOException {
413+
boolean distinctEnabled = true;
414+
final Path veryLargeFile = getTestFile();
415+
final File outputFile = Files.createTempFile("Merged-File", ".tmp").toFile();
416+
List<File> veryLargeSortBatch = ExternalSort.sortInBatch(veryLargeFile.toFile());
417+
418+
long numLinesWritten = ExternalSort.mergeSortedFiles(veryLargeSortBatch, outputFile, defaultcomparator, distinctEnabled);
419+
420+
assertEquals(1 /* 😸 */, numLinesWritten);
405421
}
406422

407423
/**

src/test/java/com/google/code/externalsorting/csv/CsvExternalSortTest.java

Lines changed: 44 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
import java.io.File;
1010
import java.io.FileReader;
1111
import java.io.IOException;
12-
import java.lang.reflect.Field;
1312
import java.nio.charset.Charset;
1413
import java.nio.charset.StandardCharsets;
1514
import java.nio.file.Files;
@@ -21,6 +20,7 @@
2120
import java.util.Map;
2221

2322
import static org.junit.Assert.assertEquals;
23+
import static org.junit.Assert.assertNotEquals;
2424

2525

2626
public class CsvExternalSortTest {
@@ -97,7 +97,7 @@ public void testIssue44() throws Exception {
9797
List<File> sortInBatch = CsvExternalSort.sortInBatch(file, null, sortOptions, header);
9898

9999

100-
int mergeSortedFiles = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);
100+
CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);
101101

102102
List<String> lines = Files.readAllLines(Paths.get(outputfile.getPath()), StandardCharsets.UTF_8);
103103
for(String a : lines) {
@@ -133,9 +133,9 @@ public void testNonLatin() throws Exception {
133133

134134
assertEquals(1, sortInBatch.size());
135135

136-
int mergeSortedFiles = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);
136+
int numLinesWritten = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);
137137

138-
assertEquals(5, mergeSortedFiles);
138+
assertEquals(5, numLinesWritten);
139139

140140
List<String> lines = Files.readAllLines(Paths.get(outputfile.getPath()), StandardCharsets.UTF_8);
141141

@@ -175,9 +175,9 @@ public void testCVSFormat() throws Exception {
175175

176176
assertEquals(1, sortInBatch.size());
177177

178-
int mergeSortedFiles = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, false, header);
178+
int numLinesWritten = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, false, header);
179179

180-
assertEquals(4, mergeSortedFiles);
180+
assertEquals(4, numLinesWritten);
181181

182182
List<String> lines = Files.readAllLines(outputfile.toPath());
183183

@@ -210,7 +210,7 @@ public void testMultiLineFileWthHeader() throws IOException, ClassNotFoundExcept
210210

211211
assertEquals(1, sortInBatch.size());
212212

213-
int mergeSortedFiles = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);
213+
CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);
214214

215215
List<String> lines = Files.readAllLines(outputfile.toPath(), sortOptions.getCharset());
216216

@@ -223,6 +223,43 @@ public void testMultiLineFileWthHeader() throws IOException, ClassNotFoundExcept
223223

224224
}
225225

226+
@Test
227+
public void testNumLinesWrittenIfDistinctEnabled() throws IOException, ClassNotFoundException {
228+
boolean distinctEnabled = true;
229+
String path = this.getClass().getClassLoader().getResource(FILE_CSV).getPath();
230+
File file = new File(path);
231+
outputfile = new File("outputSort1.csv");
232+
233+
Comparator<CSVRecord> comparator = Comparator.comparing(op -> op.get(0));
234+
235+
CsvSortOptions sortOptions = new CsvSortOptions
236+
.Builder(comparator, CsvExternalSort.DEFAULTMAXTEMPFILES, CsvExternalSort.estimateAvailableMemory())
237+
.charset(Charset.defaultCharset())
238+
.distinct(distinctEnabled)
239+
.numHeader(1)
240+
.skipHeader(true)
241+
.format(CSVFormat.DEFAULT)
242+
.build();
243+
ArrayList<CSVRecord> header = new ArrayList<CSVRecord>();
244+
245+
List<File> sortInBatch = CsvExternalSort.sortInBatch(file, null, sortOptions, header);
246+
247+
int numLinesWritten = CsvExternalSort.mergeSortedFiles(sortInBatch, outputfile, sortOptions, true, header);
248+
249+
BufferedReader reader = new BufferedReader(new FileReader(outputfile));
250+
251+
assertEquals(1, sortInBatch.size());
252+
assertEquals(3, numLinesWritten);
253+
254+
String firstLine = reader.readLine();
255+
assertEquals("6,this wont work in other systems,3", firstLine);
256+
257+
String secondLine = reader.readLine();
258+
assertNotEquals(firstLine, secondLine);
259+
260+
reader.close();
261+
}
262+
226263
@After
227264
public void onTearDown() {
228265
if(outputfile.exists()) {

0 commit comments

Comments
 (0)