Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[INLONG-10768][Sort] Csv utils support specified the max split field size #10769

Merged
merged 4 commits into from
Aug 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,19 @@ public static String[][] splitCsv(
return splitCsv(text, delimiter, escapeChar, quoteChar, lineDelimiter, false);
}

/**
 * Splits the csv text without restricting the number of fields per line.
 *
 * <p>Forwards all arguments to the seven-argument overload and passes
 * {@code null} as the max field size.
 *
 * @see StringUtils#splitCsv(String, Character, Character, Character, Character, boolean, Integer)
 */
public static String[][] splitCsv(
        @Nonnull String text,
        @Nonnull Character delimiter,
        @Nullable Character escapeChar,
        @Nullable Character quoteChar,
        @Nullable Character lineDelimiter,
        boolean deleteHeadDelimiter) {
    // A null limit disables every max-field-size check in the full overload.
    final Integer noFieldLimit = null;
    return splitCsv(text, delimiter, escapeChar, quoteChar, lineDelimiter, deleteHeadDelimiter, noFieldLimit);
}

/**
* Splits the csv text, which may contain multiple lines of data.
*
Expand All @@ -402,6 +415,7 @@ public static String[][] splitCsv(
* @param lineDelimiter The delimiter between lines, e.g. '\n'.
* @param deleteHeadDelimiter If true and the leading character of a line
* is a delimiter, it will be ignored.
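* @param maxFieldSize The maximum number of fields per line. If a line has
*                     more fields, the rest of the line is kept unsplit as
*                     the last field. {@code null} means no limit; a
*                     non-positive value yields an empty result.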
* @return A 2-D String array representing the parsed data, where the 1st
* dimension is row and the 2nd dimension is column.
*/
Expand All @@ -411,9 +425,16 @@ public static String[][] splitCsv(
@Nullable Character escapeChar,
@Nullable Character quoteChar,
@Nullable Character lineDelimiter,
boolean deleteHeadDelimiter) {
boolean deleteHeadDelimiter,
@Nullable Integer maxFieldSize) {
if (maxFieldSize != null && maxFieldSize <= 0) {
return new String[0][];
}

List<String[]> lines = new ArrayList<>();
List<String> fields = new ArrayList<>();
int splittedSize = 0;
int lastFieldStartIndex = 0;

StringBuilder stringBuilder = new StringBuilder();
int state = STATE_NORMAL;
Expand All @@ -431,6 +452,14 @@ public static String[][] splitCsv(
String field = stringBuilder.toString();
fields.add(field);
stringBuilder.setLength(0);

splittedSize++;
// once only the last allowed field remains, remember where it starts in the text
if (maxFieldSize != null && splittedSize == maxFieldSize - 1) {
if (i + 1 < text.length()) {
lastFieldStartIndex = i + 1;
}
}
break;
case STATE_ESCAPING:
stringBuilder.append(ch);
Expand Down Expand Up @@ -471,10 +500,19 @@ public static String[][] splitCsv(
case STATE_NORMAL:
String field = stringBuilder.toString();
fields.add(field);
lines.add(fields.toArray(new String[0]));

// if the max field size < the real field size,
// remove the extra fields and copy the latest field from lastFieldStartIndex to current index
if (maxFieldSize != null && fields.size() > maxFieldSize) {
fields = replaceLastField(fields, maxFieldSize, text, lastFieldStartIndex, i);
}
// reset the lastFieldStartIndex for new line
lastFieldStartIndex = i + 1;

lines.add(fields.toArray(new String[0]));
stringBuilder.setLength(0);
fields.clear();
splittedSize = 0;
break;
case STATE_ESCAPING:
stringBuilder.append(ch);
Expand All @@ -498,6 +536,11 @@ public static String[][] splitCsv(
case STATE_QUOTING:
String field = stringBuilder.toString();
fields.add(field);

if (maxFieldSize != null && fields.size() > maxFieldSize) {
fields = replaceLastField(fields, maxFieldSize, text, lastFieldStartIndex, text.length());
}

lines.add(fields.toArray(new String[0]));

String[][] result = new String[lines.size()][];
Expand All @@ -510,6 +553,28 @@ public static String[][] splitCsv(
}
}

/**
 * Builds the field list for a line that has more fields than allowed.
 *
 * <p>Keeps the first {@code maxFieldSize - 1} parsed fields and appends, as the
 * last field, the raw text between {@code lastFieldStartIndex} (inclusive) and
 * {@code lastFieldEndIndex} (exclusive). The last field is taken verbatim from
 * {@code text}, so it is not split any further.
 *
 * <p>The result is a new, independent list. The given {@code fields} list is
 * left untouched; returning a {@code subList} view instead would write the new
 * last field into the caller's list and keep the surplus fields alive behind
 * the view.
 *
 * @param fields Parsed fields of the current line
 * @param maxFieldSize Specified max field size, expected to be positive and
 *                     not greater than {@code fields.size()}
 * @param text Origin text
 * @param lastFieldStartIndex Start index (inclusive) of the last field in {@code text}
 * @param lastFieldEndIndex End index (exclusive) of the last field in {@code text}
 * @return A new list holding exactly {@code maxFieldSize} fields
 */
private static List<String> replaceLastField(
        List<String> fields,
        int maxFieldSize,
        String text,
        int lastFieldStartIndex,
        int lastFieldEndIndex) {
    // Copy the kept prefix instead of working on the subList view directly.
    List<String> limitedFields = new ArrayList<>(fields.subList(0, maxFieldSize - 1));
    limitedFields.add(text.substring(lastFieldStartIndex, lastFieldEndIndex));
    return limitedFields;
}

/**
* Concat the given fields.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,4 +112,45 @@ public void testSplitCsvString() {
assertEquals("home", csv1Array2[2][1]);
assertEquals("home", csv1Array2[2][2]);
}

@Test
public void testSplitCsvStringWithMaxFields() {
    // Input rows: one row with a quoted section containing line breaks,
    // one empty row, and one row that starts with the delimiter.
    final String input = "name|age=20\\||&'\n\name|age=20\\||&'\n\n|home|\\home\\";

    // A limit of zero yields no rows.
    String[][] limitZero = StringUtils.splitCsv(input, '|',
            '\\', '\'', '\n', false, 0);
    assertEquals(0, limitZero.length);

    // A limit of one keeps each row as a single raw field.
    String[][] limitOne = StringUtils.splitCsv(input, '|',
            '\\', '\'', '\n', false, 1);
    assertEquals("name|age=20\\||&'\n\name|age=20\\||&'", limitOne[0][0]);
    assertEquals("", limitOne[1][0]);
    assertEquals("|home|\\home\\", limitOne[2][0]);

    // A limit of two splits off the first field and keeps the rest raw.
    String[][] limitTwo = StringUtils.splitCsv(input, '|',
            '\\', '\'', '\n', false, 2);
    assertEquals("name", limitTwo[0][0]);
    assertEquals("age=20\\||&'\n\name|age=20\\||&'", limitTwo[0][1]);
    assertEquals("", limitTwo[1][0]);
    assertEquals("", limitTwo[2][0]);
    assertEquals("home|\\home\\", limitTwo[2][1]);

    // A limit of three: the checked fields come back fully parsed.
    String[][] limitThree = StringUtils.splitCsv(input, '|',
            '\\', '\'', '\n', false, 3);
    assertEquals("name", limitThree[0][0]);
    assertEquals("age=20|", limitThree[0][1]);
    assertEquals("&\n\name|age=20\\||&", limitThree[0][2]);
    assertEquals("", limitThree[2][0]);
    assertEquals("home", limitThree[2][1]);
    assertEquals("home", limitThree[2][2]);

    // A limit of four: same expected fields as with a limit of three.
    String[][] limitFour = StringUtils.splitCsv(input, '|',
            '\\', '\'', '\n', false, 4);
    assertEquals("name", limitFour[0][0]);
    assertEquals("age=20|", limitFour[0][1]);
    assertEquals("&\n\name|age=20\\||&", limitFour[0][2]);
    assertEquals("", limitFour[2][0]);
    assertEquals("home", limitFour[2][1]);
    assertEquals("home", limitFour[2][2]);
}
}
Loading