JNI bindings to write CSV #12425

Merged
merged 27 commits into branch-23.02 from hive-text-writer on Jan 5, 2023
Changes from 20 commits
Commits
27 commits
21c3deb
JNI bindings to write CSV
mythrocks Dec 15, 2022
7fa0204
Support for chunked CSV writes in JNI:
mythrocks Dec 21, 2022
faa64c4
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Dec 21, 2022
2fa91e7
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Dec 21, 2022
e446ae3
Added tests for header inclusion.
mythrocks Dec 21, 2022
54a5a87
Formatting.
mythrocks Dec 21, 2022
c8f74de
Support to specify TRUE/FALSE strings.
mythrocks Dec 21, 2022
ebbfcb8
Added tests for combinations of True/False reps, header inclusion, etc.
mythrocks Dec 21, 2022
cce5574
Removed JNI's non-chunked CSV writes to memory.
mythrocks Dec 21, 2022
7089163
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Dec 27, 2022
15693f1
Added newline at the end of the file, per CUDF guideline.
mythrocks Dec 27, 2022
15e84c5
Removed unnecessary whitespace at top of file.
mythrocks Dec 27, 2022
0da15a4
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Dec 28, 2022
e9107c9
Re-added whitespace at end of file.
mythrocks Dec 28, 2022
52f62e2
Fixed header order. Removed trailing newlines.
mythrocks Dec 30, 2022
af7eed3
Postpone setting _first_write till after write.
mythrocks Dec 30, 2022
5728549
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Dec 30, 2022
0d82984
Trailing newlines.
mythrocks Dec 30, 2022
fa24027
Review changes:
mythrocks Dec 30, 2022
f5e30c5
More formatting.
mythrocks Dec 30, 2022
d3642a4
Updated documentation for _inter_column_delimiter.
mythrocks Jan 3, 2023
c83e0d9
Updated copyright date.
mythrocks Jan 3, 2023
6dee89a
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Jan 3, 2023
e4fa895
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Jan 4, 2023
bfd2cd3
Review fixes:
mythrocks Jan 5, 2023
b57e8d9
Merge remote-tracking branch 'origin/branch-23.02' into hive-text-writer
mythrocks Jan 5, 2023
8d9b374
Merge branch 'branch-23.02' into hive-text-writer
mythrocks Jan 5, 2023
9 changes: 8 additions & 1 deletion cpp/include/cudf/io/csv.hpp
@@ -1332,7 +1332,7 @@ class csv_writer_options {
size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
// character to use for separating lines (default "\n")
std::string _line_terminator = "\n";
-  // character to use for separating lines (default "\n")
+  // character to use for separating column values (default ",")
char _inter_column_delimiter = ',';
// string to use for values != 0 in INT8 types (default 'true')
std::string _true_value = std::string{"true"};
@@ -1498,6 +1498,13 @@ class csv_writer_options {
* @param val String to represent values == 0 in INT8 types
*/
void set_false_value(std::string val) { _false_value = val; }

/**
* @brief (Re)sets the table being written.
*
* @param table Table to be written
*/
void set_table(table_view const& table) { _table = table; }
Reviewer comment (Contributor):
Looks good. As mentioned offline, we might want to look into separating the sink, input table and writer options to facilitate easier reuse of options. Not in scope for this PR.

};

/**
134 changes: 134 additions & 0 deletions java/src/main/java/ai/rapids/cudf/CSVWriterOptions.java
@@ -0,0 +1,134 @@
/*
*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package ai.rapids.cudf;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class CSVWriterOptions {

private String[] columnNames;
private Boolean includeHeader = false;
private String rowDelimiter = "\n";
private byte fieldDelimiter = ',';
private String nullValue = "\\N";
private String falseValue = "false";
private String trueValue = "true";

private CSVWriterOptions(Builder builder) {
this.columnNames = builder.columnNames.toArray(new String[builder.columnNames.size()]);
this.nullValue = builder.nullValue;
this.includeHeader = builder.includeHeader;
this.fieldDelimiter = builder.fieldDelimiter;
this.rowDelimiter = builder.rowDelimiter;
this.falseValue = builder.falseValue;
this.trueValue = builder.trueValue;
}

public String[] getColumnNames() {
return columnNames;
}

public Boolean getIncludeHeader() {
return includeHeader;
}

public String getRowDelimiter() {
return rowDelimiter;
}

public byte getFieldDelimiter() {
return fieldDelimiter;
}

public String getNullValue() {
return nullValue;
}

public String getTrueValue() {
return trueValue;
}

public String getFalseValue() {
return falseValue;
}

public static Builder builder() {
return new Builder();
}

public static class Builder {

private List<String> columnNames = Collections.emptyList();
private Boolean includeHeader = false;
private String rowDelimiter = "\n";
private byte fieldDelimiter = ',';
private String nullValue = "\\N";
private String falseValue = "false";
private String trueValue = "true";

public CSVWriterOptions build() {
return new CSVWriterOptions(this);
}

public Builder withColumnNames(List<String> columnNames) {
this.columnNames = columnNames;
return this;
}

public Builder withColumnNames(String... columnNames) {
List<String> columnNamesList = new ArrayList<>();
for (String columnName : columnNames) {
columnNamesList.add(columnName);
}
return withColumnNames(columnNamesList);
}

public Builder withIncludeHeader(Boolean includeHeader) {
this.includeHeader = includeHeader;
return this;
}

public Builder withRowDelimiter(String rowDelimiter) {
this.rowDelimiter = rowDelimiter;
return this;
}

public Builder withFieldDelimiter(byte fieldDelimiter) {
this.fieldDelimiter = fieldDelimiter;
return this;
}

public Builder withNullValue(String nullValue) {
this.nullValue = nullValue;
return this;
}

public Builder withTrueValue(String trueValue) {
this.trueValue = trueValue;
return this;
}

public Builder withFalseValue(String falseValue) {
this.falseValue = falseValue;
return this;
}
}
}
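
For orientation, here is a minimal usage sketch of the builder above; the column names and delimiter values are illustrative only and are not taken from the PR's tests.

    // Sketch: construct CSV writer options via the builder (all values below are hypothetical).
    CSVWriterOptions options = CSVWriterOptions.builder()
        .withColumnNames("id", "name", "score")  // hypothetical column names
        .withIncludeHeader(true)                 // emit a header row before the data
        .withFieldDelimiter((byte) ',')          // column separator
        .withRowDelimiter("\n")                  // line terminator
        .withNullValue("\\N")                    // Hive-style representation for nulls
        .withTrueValue("true")                   // rendering of boolean true
        .withFalseValue("false")                 // rendering of boolean false
        .build();

Each with*() call overrides the corresponding default shown in the Builder fields, so callers only set the options that differ from the defaults.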
76 changes: 76 additions & 0 deletions java/src/main/java/ai/rapids/cudf/Table.java
@@ -857,6 +857,82 @@ public static Table readCSV(Schema schema, CSVOptions opts, HostMemoryBuffer buf
opts.getFalseValues()));
}

private static native void writeCSVToFile(long table,
String[] columnNames,
boolean includeHeader,
String rowDelimiter,
byte fieldDelimiter,
String nullValue,
String trueValue,
String falseValue,
String outputPath) throws CudfException;

public void writeCSVToFile(CSVWriterOptions options, String outputPath) {
writeCSVToFile(nativeHandle,
options.getColumnNames(),
options.getIncludeHeader(),
options.getRowDelimiter(),
options.getFieldDelimiter(),
options.getNullValue(),
options.getTrueValue(),
options.getFalseValue(),
outputPath);
}
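
As a usage sketch (not from the PR's tests): given an existing Table, this non-chunked path writes the whole table to a file in one call. The column names and output path below are hypothetical.

    // Sketch: write an existing Table to a CSV file in one shot.
    // Assumes 'table' is a two-column Table obtained elsewhere (e.g. from Table.readCSV).
    CSVWriterOptions opts = CSVWriterOptions.builder()
        .withColumnNames("ints", "strings")          // hypothetical names matching the table's columns
        .withIncludeHeader(true)
        .withNullValue("\\N")
        .build();
    table.writeCSVToFile(opts, "/tmp/example.csv");  // hypothetical output path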

private static native long startWriteCSVToBuffer(String[] columnNames,
boolean includeHeader,
String rowDelimiter,
byte fieldDelimiter,
String nullValue,
String trueValue,
String falseValue,
HostBufferConsumer buffer) throws CudfException;

private static native void writeCSVChunkToBuffer(long writerHandle, long tableHandle);

private static native void endWriteCSVToBuffer(long writerHandle);

private static class CSVTableWriter implements TableWriter {
private long writerHandle;
private HostBufferConsumer consumer;

private CSVTableWriter(CSVWriterOptions options, HostBufferConsumer consumer) {
this.writerHandle = startWriteCSVToBuffer(options.getColumnNames(),
options.getIncludeHeader(),
options.getRowDelimiter(),
options.getFieldDelimiter(),
options.getNullValue(),
options.getTrueValue(),
options.getFalseValue(),
consumer);
this.consumer = consumer;
}

@Override
public void write(Table table) {
if (writerHandle == 0) {
throw new IllegalStateException("Writer was already closed");
}
writeCSVChunkToBuffer(writerHandle, table.nativeHandle);
}

@Override
public void close() throws CudfException {
if (writerHandle != 0) {
endWriteCSVToBuffer(writerHandle);
writerHandle = 0;
}
if (consumer != null) {
consumer.done();
consumer = null;
}
}
}

public static TableWriter getCSVBufferWriter(CSVWriterOptions options, HostBufferConsumer bufferConsumer) {
return new CSVTableWriter(options, bufferConsumer);
}
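
A rough sketch of the chunked path follows; the HostBufferConsumer and the chunk tables are assumed to be supplied by the caller, and error handling is elided.

    // Sketch: stream several tables through the chunked CSV writer.
    // 'consumer' is assumed to be a caller-provided sink (e.g. one that appends each buffer to a file);
    // 'chunk1' and 'chunk2' are assumed to be Tables with the same schema.
    CSVWriterOptions opts = CSVWriterOptions.builder()
        .withColumnNames("ints", "strings")   // hypothetical column names
        .withIncludeHeader(true)              // the header is emitted once, with the first chunk
        .build();
    TableWriter writer = Table.getCSVBufferWriter(opts, consumer);
    try {
      writer.write(chunk1);                   // appends chunk1's rows as CSV to the consumer
      writer.write(chunk2);                   // appends chunk2's rows, without repeating the header
    } finally {
      writer.close();                         // flushes the native writer and signals consumer.done()
    }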

/**
* Read a JSON file using the default JSONOptions.
* @param schema the schema of the file. You may use Schema.INFERRED to infer the schema.