Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
lujop committed Apr 27, 2021
1 parent 2b1b920 commit bac2f13
Show file tree
Hide file tree
Showing 17 changed files with 790 additions and 106 deletions.
102 changes: 72 additions & 30 deletions core/src/main/java/tech/tablesaw/io/FileReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.collect.Streams;
import com.univocity.parsers.common.AbstractParser;
import java.io.Reader;
import java.util.ArrayList;
Expand All @@ -12,6 +13,7 @@
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Random;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -25,42 +27,65 @@ public abstract class FileReader {
private static Logger logger = LoggerFactory.getLogger(FileReader.class);
private static final int UNLIMITED_SAMPLE_SIZE = -1;

/**
 * Infers the column types without supplying column names, by delegating to the five-argument
 * overload with {@code null} as the column-name array.
 *
 * @deprecated Use {@link #getColumnTypes(Reader, ReadOptions, int, AbstractParser, String[])}
 */
@Deprecated
public ColumnType[] getColumnTypes(
Reader reader, ReadOptions options, int linesToSkip, AbstractParser<?> parser) {
// With a null name array, user-configured types are looked up with a null column name.
return getColumnTypes(reader, options, linesToSkip, parser, null);
}
/**
* Returns an array containing the inferred columnTypes for the file being read, as calculated by
* the ColumnType inference logic. These types may not be correct.
*/
public ColumnType[] getColumnTypes(
Reader reader, ReadOptions options, int linesToSkip, AbstractParser<?> parser) {
Reader reader,
ReadOptions options,
int linesToSkip,
AbstractParser<?> parser,
String[] columnNames) {

parser.beginParsing(reader);
if (parser.getContext() == null) parser.beginParsing(reader);

for (int i = 0; i < linesToSkip; i++) {
parser.parseNext();
}

ColumnTypeDetector detector = new ColumnTypeDetector(options.columnTypesToDetect());

return detector.detectColumnTypes(
new Iterator<String[]>() {

String[] nextRow = parser.parseNext();

@Override
public boolean hasNext() {
return nextRow != null;
}

@Override
public String[] next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
String[] tmp = nextRow;
nextRow = parser.parseNext();
return tmp;
}
},
options);
ColumnType[] columnTypes =
detector.detectColumnTypes(
new Iterator<String[]>() {

String[] nextRow = parser.parseNext();

@Override
public boolean hasNext() {
return nextRow != null;
}

@Override
public String[] next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
String[] tmp = nextRow;
nextRow = parser.parseNext();
return tmp;
}
},
options);

// Column types configured by the user, if any, replace the detected ones
for (int i = 0; i < columnTypes.length; i++) {
boolean hasColumnName = columnNames != null && i < columnNames.length;
Optional<ColumnType> configuredColumnType =
options.columnTypeReadOptions().columnType(i, hasColumnName ? columnNames[i] : null);
if (configuredColumnType.isPresent()) columnTypes[i] = configuredColumnType.get();
}

return columnTypes;
}

private String cleanName(String name) {
Expand All @@ -69,7 +94,9 @@ private String cleanName(String name) {

/** Returns the column names for each column in the source. */
public String[] getColumnNames(
ReadOptions options, ColumnType[] types, AbstractParser<?> parser) {
ReadOptions options,
ReadOptions.ColumnTypeReadOptions columnTypeReadOptions,
AbstractParser<?> parser) {

if (options.header()) {

Expand All @@ -89,8 +116,12 @@ public String[] getColumnNames(
return headerNames;
} else {
// Placeholder column names for when the file read has no header
String[] headerNames = new String[types.length];
for (int i = 0; i < types.length; i++) {
int columnLength =
columnTypeReadOptions.columnTypes() != null
? columnTypeReadOptions.columnTypes().length
: 0;
String[] headerNames = new String[columnLength];
for (int i = 0; i < columnLength; i++) {
headerNames[i] = "C" + i;
}
return headerNames;
Expand Down Expand Up @@ -123,22 +154,33 @@ protected Table parseRows(
ReadOptions options,
boolean headerOnly,
Reader reader,
ColumnType[] types,
ReadOptions.ColumnTypeReadOptions columnTypeReadOptions,
AbstractParser<?> parser) {
return parseRows(options, headerOnly, reader, types, parser, UNLIMITED_SAMPLE_SIZE);
return parseRows(
options, headerOnly, reader, columnTypeReadOptions, parser, UNLIMITED_SAMPLE_SIZE);
}

protected Table parseRows(
ReadOptions options,
boolean headerOnly,
Reader reader,
ColumnType[] types,
ReadOptions.ColumnTypeReadOptions columnTypeReadOptions,
AbstractParser<?> parser,
int sampleSize) {
parser.beginParsing(reader);
Table table = Table.create(options.tableName());

List<String> headerRow = Lists.newArrayList(getColumnNames(options, types, parser));
List<String> headerRow =
Lists.newArrayList(getColumnNames(options, columnTypeReadOptions, parser));

@SuppressWarnings({"UnstableApiUsage", "OptionalGetWithoutIsPresent"})
ColumnType[] types =
Streams.mapWithIndex(
headerRow.stream(),
(columnName, idx) -> columnTypeReadOptions.columnType((int) idx, columnName))
.filter(Optional::isPresent)
.map(Optional::get)
.toArray(ColumnType[]::new);

for (int x = 0; x < types.length; x++) {
if (types[x] != SKIP) {
Expand Down
136 changes: 135 additions & 1 deletion core/src/main/java/tech/tablesaw/io/ReadOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,13 @@
import java.io.Reader;
import java.net.URL;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import tech.tablesaw.api.ColumnType;

public class ReadOptions {
Expand Down Expand Up @@ -85,6 +90,8 @@ public class ReadOptions {
protected final DateTimeFormatter dateTimeFormatter;
protected final DateTimeFormatter timeFormatter;

protected final ColumnTypeReadOptions columnTypeReadOptions;

protected final boolean header;

protected ReadOptions(ReadOptions.Builder builder) {
Expand All @@ -107,6 +114,13 @@ protected ReadOptions(ReadOptions.Builder builder) {

allowDuplicateColumnNames = builder.allowDuplicateColumnNames;

columnTypeReadOptions =
new ColumnTypeReadOptions(
builder.columnTypes,
builder.columnTypeMap,
builder.completeColumnTypeFunction,
builder.columnTypeFunction);

if (builder.locale == null) {
locale = Locale.getDefault();
} else {
Expand Down Expand Up @@ -185,6 +199,10 @@ public DateTimeFormatter dateFormatter() {
return DateTimeFormatter.ofPattern(dateFormat, locale);
}

/** Returns the column type configuration that was assembled from the builder's settings. */
public ColumnTypeReadOptions columnTypeReadOptions() {
return columnTypeReadOptions;
}

protected static class Builder {

protected final Source source;
Expand All @@ -204,6 +222,10 @@ protected static class Builder {
protected int maxCharsPerColumn = 4096;
protected boolean ignoreZeroDecimal = DEFAULT_IGNORE_ZERO_DECIMAL;
private boolean allowDuplicateColumnNames = false;
protected ColumnType[] columnTypes;
protected Map<String, ColumnType> columnTypeMap = new HashMap<>();
protected Function<String, Optional<ColumnType>> columnTypeFunction;
protected Function<String, ColumnType> completeColumnTypeFunction;

protected Builder() {
source = null;
Expand Down Expand Up @@ -314,7 +336,15 @@ public Builder locale(Locale locale) {

/** @see ColumnTypeDetector */
public Builder columnTypesToDetect(List<ColumnType> columnTypesToDetect) {
this.columnTypesToDetect = columnTypesToDetect;
// Types need to be in certain order as more general types like string come last
// Otherwise everything will be parsed as a string
List<ColumnType> orderedTypes = new ArrayList<>();
for (ColumnType t : EXTENDED_TYPES) {
if (columnTypesToDetect.contains(t)) {
orderedTypes.add(t);
}
}
this.columnTypesToDetect = orderedTypes;
return this;
}

Expand All @@ -327,8 +357,112 @@ public Builder minimizeColumnSizes() {
return this;
}

/**
 * Provides the column types for all columns, by position, so that the column type autodetection
 * logic does not have to run. The array is expected to contain an entry for every column.
 *
 * @param columnTypes the type of each column, in column order; stored by reference, not copied
 * @return this builder
 */
public Builder columnTypes(ColumnType[] columnTypes) {
this.columnTypes = columnTypes;
return this;
}

/**
 * Registers the type to use for the column with the given name, replacing any type previously
 * registered under that name.
 *
 * @param columnName name of the column, used as the key in the name-to-type map
 * @param columnType type to associate with that column name
 * @return this builder
 */
public Builder columnType(String columnName, ColumnType columnType) {
this.columnTypeMap.put(columnName, columnType);
return this;
}

/**
 * Provides a function that determines the ColumnType for some column names. To provide types
 * for all column names use {@link #completeColumnTypeByNameFunction(Function)} instead, because
 * it prevents running unnecessary column type autodetection, which can be expensive in some
 * situations.
 *
 * @param columnTypeFunction maps a column name to its type, or to an empty Optional when the
 *     function has no type for that name
 * @return this builder
 */
public Builder columnTypeByNameFunction(
Function<String, Optional<ColumnType>> columnTypeFunction) {
this.columnTypeFunction = columnTypeFunction;
return this;
}

/**
 * Provides a function that determines the ColumnType for all column names. To provide types
 * only for some column names use {@link #columnTypeByNameFunction(Function)}.
 *
 * <p>Providing this function prevents the column type autodetection logic from running.
 *
 * @param columnTypeFunction maps every column name to its type
 * @return this builder
 */
public Builder completeColumnTypeByNameFunction(
Function<String, ColumnType> columnTypeFunction) {
this.completeColumnTypeFunction = columnTypeFunction;
return this;
}

/**
 * Replaces the name-to-type mapping used to look up configured column types. A {@code null}
 * argument is ignored and the current mapping is kept.
 *
 * <p>The given map is stored by reference rather than copied, so a later call to {@link
 * #columnType(String, ColumnType)} puts its entry into that same map instance.
 *
 * @param columnTypeByName column types keyed by column name, or {@code null} to keep the
 *     current mapping
 * @return this builder
 */
public Builder columnTypes(Map<String, ColumnType> columnTypeByName) {
if (columnTypeByName != null) this.columnTypeMap = columnTypeByName;

return this;
}

/** Creates a {@link ReadOptions} from the current state of this builder. */
public ReadOptions build() {
return new ReadOptions(this);
}
}

/**
 * Holds the user-supplied column type configuration that is consulted when a source is read. It
 * can work in three ways:
 *
 * <ul>
 *   <li>If no information is provided, column types are autodetected.
 *   <li>Types for every column can be provided using {@link
 *       ReadOptions.Builder#columnTypes(ColumnType[])} or {@link
 *       ReadOptions.Builder#completeColumnTypeByNameFunction(Function)}; they are used as given,
 *       which makes autodetection unnecessary.
 *   <li>Types for some columns can be provided using {@link
 *       ReadOptions.Builder#columnType(String, ColumnType)} or {@link
 *       ReadOptions.Builder#columnTypeByNameFunction(Function)}. In this case the provided types
 *       are used and the remaining columns are autodetected.
 * </ul>
 */
public static class ColumnTypeReadOptions {
  final ColumnType[] columnTypesByIdx;
  final Map<String, ColumnType> columnTypesByNameMap;
  final Function<String, Optional<ColumnType>> columnTypesByNameFunction;
  final Function<String, ColumnType> completeColumnTypesByNameFunction;

  /** Creates options that carry only a positional array with the type of every column. */
  public static ColumnTypeReadOptions of(ColumnType[] allColumnTypes) {
    return new ColumnTypeReadOptions(allColumnTypes, null, null, null);
  }

  ColumnTypeReadOptions(
      ColumnType[] columnTypesByIdx,
      Map<String, ColumnType> columnTypesByNameMap,
      Function<String, ColumnType> completeColumnTypesByNameFunction,
      Function<String, Optional<ColumnType>> columnTypesByNameFunction) {
    this.columnTypesByIdx = columnTypesByIdx;
    this.columnTypesByNameMap = columnTypesByNameMap;
    this.columnTypesByNameFunction = columnTypesByNameFunction;
    this.completeColumnTypesByNameFunction = completeColumnTypesByNameFunction;
  }

  /**
   * Looks up the configured type for a column. The sources are consulted in this order and the
   * first one that yields a type wins: the positional array, the name-to-type map, the complete
   * by-name function, and finally the partial by-name function.
   *
   * @param columnNumber zero-based position of the column
   * @param columnName name of the column; passed as-is to the map and to the functions
   * @return the configured type, or an empty Optional when nothing is configured for the column
   * @throws NullPointerException if the complete by-name function returns {@code null}
   */
  public Optional<ColumnType> columnType(int columnNumber, String columnName) {
    if (columnTypesByIdx != null
        && columnNumber < columnTypesByIdx.length
        && columnTypesByIdx[columnNumber] != null) {
      return Optional.of(columnTypesByIdx[columnNumber]);
    }

    if (columnTypesByNameMap != null) {
      ColumnType mappedType = columnTypesByNameMap.get(columnName);
      if (mappedType != null) {
        return Optional.of(mappedType);
      }
    }

    if (completeColumnTypesByNameFunction != null) {
      ColumnType computedType = completeColumnTypesByNameFunction.apply(columnName);
      if (computedType == null) {
        // A "complete" function promises a type for every column; fail with a message that names
        // the offending column instead of the bare NullPointerException from Optional.of(null).
        throw new NullPointerException(
            "The complete column type function returned null for column '" + columnName + "'");
      }
      return Optional.of(computedType);
    }

    if (columnTypesByNameFunction != null) {
      Optional<ColumnType> partialType = columnTypesByNameFunction.apply(columnName);
      // Guard against a function that returns null instead of Optional.empty(), so that callers
      // can always call isPresent() on the result.
      if (partialType != null) {
        return partialType;
      }
    }

    return Optional.empty();
  }

  /** Returns the positional column type array as configured; may be {@code null}. */
  public ColumnType[] columnTypes() {
    return columnTypesByIdx;
  }

  /**
   * Returns true when a type can be produced for every column, either from the positional array
   * or from the complete by-name function.
   */
  public boolean canCalculateColumnTypeForAllColumns() {
    return hasColumnTypeForAllColumns() || completeColumnTypesByNameFunction != null;
  }

  /** Returns true when a non-empty positional column type array was configured. */
  public boolean hasColumnTypeForAllColumns() {
    return columnTypesByIdx != null && columnTypesByIdx.length > 0;
  }
}
}
10 changes: 10 additions & 0 deletions core/src/main/java/tech/tablesaw/io/TableBuildingUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import tech.tablesaw.api.ColumnType;
import tech.tablesaw.api.Table;

Expand All @@ -18,6 +19,15 @@ public static Table build(
ColumnTypeDetector detector = new ColumnTypeDetector(options.columnTypesToDetect());
Iterator<String[]> iterator = dataRows.iterator();
ColumnType[] types = detector.detectColumnTypes(iterator, options);

// Column types configured by the user, if any, replace the detected ones
for (int i = 0; i < types.length; i++) {
boolean hasColumnName = i < columnNames.size();
Optional<ColumnType> configuredColumnType =
options.columnTypeReadOptions().columnType(i, hasColumnName ? columnNames.get(i) : null);
if (configuredColumnType.isPresent()) types[i] = configuredColumnType.get();
}

for (int i = 0; i < columnNames.size(); i++) {
table.addColumns(types[i].create(columnNames.get(i)));
}
Expand Down
Loading

0 comments on commit bac2f13

Please sign in to comment.