From bac2f134803a20db1a275b244120096904a96666 Mon Sep 17 00:00:00 2001 From: Joan Pujol Date: Mon, 26 Apr 2021 19:41:45 +0200 Subject: [PATCH] Fix #822 and #815 --- .../java/tech/tablesaw/io/FileReader.java | 102 +++++++++---- .../java/tech/tablesaw/io/ReadOptions.java | 136 +++++++++++++++++- .../tech/tablesaw/io/TableBuildingUtils.java | 10 ++ .../tech/tablesaw/io/csv/CsvReadOptions.java | 51 ++++--- .../java/tech/tablesaw/io/csv/CsvReader.java | 40 ++++-- .../io/fixed/FixedWidthReadOptions.java | 31 +++- .../tablesaw/io/fixed/FixedWidthReader.java | 40 ++++-- .../tech/tablesaw/io/csv/CsvReaderTest.java | 101 +++++++++++++ .../io/fixed/FixedWidthReaderTest.java | 57 +++++++- data/bus_stop_test_no_data.csv | 1 + data/columns.xlsx | Bin 8041 -> 11293 bytes .../tablesaw/io/xlsx/XlsxReadOptions.java | 30 ++++ .../tech/tablesaw/io/xlsx/XlsxReader.java | 88 +++++++++--- .../tech/tablesaw/io/xlsx/XlsxReaderTest.java | 115 +++++++++++++-- .../tablesaw/io/html/HtmlReadOptions.java | 30 ++++ .../tablesaw/io/json/JsonReadOptions.java | 30 ++++ .../tech/tablesaw/io/json/JsonReaderTest.java | 34 +++++ 17 files changed, 790 insertions(+), 106 deletions(-) create mode 100644 data/bus_stop_test_no_data.csv diff --git a/core/src/main/java/tech/tablesaw/io/FileReader.java b/core/src/main/java/tech/tablesaw/io/FileReader.java index d84ee9562..a84451840 100644 --- a/core/src/main/java/tech/tablesaw/io/FileReader.java +++ b/core/src/main/java/tech/tablesaw/io/FileReader.java @@ -4,6 +4,7 @@ import com.google.common.base.Strings; import com.google.common.collect.Lists; +import com.google.common.collect.Streams; import com.univocity.parsers.common.AbstractParser; import java.io.Reader; import java.util.ArrayList; @@ -12,6 +13,7 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; +import java.util.Optional; import java.util.Random; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,14 +27,26 @@ public abstract class FileReader { private 
static Logger logger = LoggerFactory.getLogger(FileReader.class); private static final int UNLIMITED_SAMPLE_SIZE = -1; + /** + * @deprecated Use {@link #getColumnTypes(Reader, ReadOptions, int, AbstractParser, String[])} } + */ + @Deprecated + public ColumnType[] getColumnTypes( + Reader reader, ReadOptions options, int linesToSkip, AbstractParser parser) { + return getColumnTypes(reader, options, linesToSkip, parser, null); + } /** * Returns an array containing the inferred columnTypes for the file being read, as calculated by * the ColumnType inference logic. These types may not be correct. */ public ColumnType[] getColumnTypes( - Reader reader, ReadOptions options, int linesToSkip, AbstractParser parser) { + Reader reader, + ReadOptions options, + int linesToSkip, + AbstractParser parser, + String[] columnNames) { - parser.beginParsing(reader); + if (parser.getContext() == null) parser.beginParsing(reader); for (int i = 0; i < linesToSkip; i++) { parser.parseNext(); @@ -40,27 +54,38 @@ public ColumnType[] getColumnTypes( ColumnTypeDetector detector = new ColumnTypeDetector(options.columnTypesToDetect()); - return detector.detectColumnTypes( - new Iterator() { - - String[] nextRow = parser.parseNext(); - - @Override - public boolean hasNext() { - return nextRow != null; - } - - @Override - public String[] next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - String[] tmp = nextRow; - nextRow = parser.parseNext(); - return tmp; - } - }, - options); + ColumnType[] columnTypes = + detector.detectColumnTypes( + new Iterator() { + + String[] nextRow = parser.parseNext(); + + @Override + public boolean hasNext() { + return nextRow != null; + } + + @Override + public String[] next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + String[] tmp = nextRow; + nextRow = parser.parseNext(); + return tmp; + } + }, + options); + + // If there are columnTypes configured by the user use them + for (int i = 0; i < columnTypes.length; i++) { + 
boolean hasColumnName = columnNames != null && i < columnNames.length; + Optional configuredColumnType = + options.columnTypeReadOptions().columnType(i, hasColumnName ? columnNames[i] : null); + if (configuredColumnType.isPresent()) columnTypes[i] = configuredColumnType.get(); + } + + return columnTypes; } private String cleanName(String name) { @@ -69,7 +94,9 @@ private String cleanName(String name) { /** Returns the column names for each column in the source. */ public String[] getColumnNames( - ReadOptions options, ColumnType[] types, AbstractParser parser) { + ReadOptions options, + ReadOptions.ColumnTypeReadOptions columnTypeReadOptions, + AbstractParser parser) { if (options.header()) { @@ -89,8 +116,12 @@ public String[] getColumnNames( return headerNames; } else { // Placeholder column names for when the file read has no header - String[] headerNames = new String[types.length]; - for (int i = 0; i < types.length; i++) { + int columnLength = + columnTypeReadOptions.columnTypes() != null + ? 
columnTypeReadOptions.columnTypes().length + : 0; + String[] headerNames = new String[columnLength]; + for (int i = 0; i < columnLength; i++) { headerNames[i] = "C" + i; } return headerNames; @@ -123,22 +154,33 @@ protected Table parseRows( ReadOptions options, boolean headerOnly, Reader reader, - ColumnType[] types, + ReadOptions.ColumnTypeReadOptions columnTypeReadOptions, AbstractParser parser) { - return parseRows(options, headerOnly, reader, types, parser, UNLIMITED_SAMPLE_SIZE); + return parseRows( + options, headerOnly, reader, columnTypeReadOptions, parser, UNLIMITED_SAMPLE_SIZE); } protected Table parseRows( ReadOptions options, boolean headerOnly, Reader reader, - ColumnType[] types, + ReadOptions.ColumnTypeReadOptions columnTypeReadOptions, AbstractParser parser, int sampleSize) { parser.beginParsing(reader); Table table = Table.create(options.tableName()); - List headerRow = Lists.newArrayList(getColumnNames(options, types, parser)); + List headerRow = + Lists.newArrayList(getColumnNames(options, columnTypeReadOptions, parser)); + + @SuppressWarnings({"UnstableApiUsage", "OptionalGetWithoutIsPresent"}) + ColumnType[] types = + Streams.mapWithIndex( + headerRow.stream(), + (columnName, idx) -> columnTypeReadOptions.columnType((int) idx, columnName)) + .filter(Optional::isPresent) + .map(Optional::get) + .toArray(ColumnType[]::new); for (int x = 0; x < types.length; x++) { if (types[x] != SKIP) { diff --git a/core/src/main/java/tech/tablesaw/io/ReadOptions.java b/core/src/main/java/tech/tablesaw/io/ReadOptions.java index 761d310cd..b2bb12420 100644 --- a/core/src/main/java/tech/tablesaw/io/ReadOptions.java +++ b/core/src/main/java/tech/tablesaw/io/ReadOptions.java @@ -35,8 +35,13 @@ import java.io.Reader; import java.net.URL; import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import 
java.util.function.Function; import tech.tablesaw.api.ColumnType; public class ReadOptions { @@ -85,6 +90,8 @@ public class ReadOptions { protected final DateTimeFormatter dateTimeFormatter; protected final DateTimeFormatter timeFormatter; + protected final ColumnTypeReadOptions columnTypeReadOptions; + protected final boolean header; protected ReadOptions(ReadOptions.Builder builder) { @@ -107,6 +114,13 @@ protected ReadOptions(ReadOptions.Builder builder) { allowDuplicateColumnNames = builder.allowDuplicateColumnNames; + columnTypeReadOptions = + new ColumnTypeReadOptions( + builder.columnTypes, + builder.columnTypeMap, + builder.completeColumnTypeFunction, + builder.columnTypeFunction); + if (builder.locale == null) { locale = Locale.getDefault(); } else { @@ -185,6 +199,10 @@ public DateTimeFormatter dateFormatter() { return DateTimeFormatter.ofPattern(dateFormat, locale); } + public ColumnTypeReadOptions columnTypeReadOptions() { + return columnTypeReadOptions; + } + protected static class Builder { protected final Source source; @@ -204,6 +222,10 @@ protected static class Builder { protected int maxCharsPerColumn = 4096; protected boolean ignoreZeroDecimal = DEFAULT_IGNORE_ZERO_DECIMAL; private boolean allowDuplicateColumnNames = false; + protected ColumnType[] columnTypes; + protected Map columnTypeMap = new HashMap<>(); + protected Function> columnTypeFunction; + protected Function completeColumnTypeFunction; protected Builder() { source = null; @@ -314,7 +336,15 @@ public Builder locale(Locale locale) { /** @see ColumnTypeDetector */ public Builder columnTypesToDetect(List columnTypesToDetect) { - this.columnTypesToDetect = columnTypesToDetect; + // Types need to be in certain order as more general types like string come last + // Otherwise everything will be parsed as a string + List orderedTypes = new ArrayList<>(); + for (ColumnType t : EXTENDED_TYPES) { + if (columnTypesToDetect.contains(t)) { + orderedTypes.add(t); + } + } + this.columnTypesToDetect = 
orderedTypes; return this; } @@ -327,8 +357,112 @@ public Builder minimizeColumnSizes() { return this; } + /** + * Provide column types for all columns preventing autodetect column type logic. It's expected + * that the array contains all columns + */ + public Builder columnTypes(ColumnType[] columnTypes) { + this.columnTypes = columnTypes; + return this; + } + + public Builder columnType(String columnName, ColumnType columnType) { + this.columnTypeMap.put(columnName, columnType); + return this; + } + + /** + * Provide a function that determines ColumnType for some column names. To provide for all + * column names use {@link #completeColumnTypeByNameFunction(Function)} because it prevents + * running unnecessary autodetect column type logic that can be expensive in some situations + */ + public Builder columnTypeByNameFunction( + Function> columnTypeFunction) { + this.columnTypeFunction = columnTypeFunction; + return this; + } + + /** + * Provide a function that determines ColumnType for all column names. To provide only for some + * use {@link #columnTypeByNameFunction(Function)} + * + *

Providing that function prevents running autodetect column type logic + */ + public Builder completeColumnTypeByNameFunction( + Function columnTypeFunction) { + this.completeColumnTypeFunction = columnTypeFunction; + return this; + } + + public Builder columnTypes(Map columnTypeByName) { + if (columnTypeByName != null) this.columnTypeMap = columnTypeByName; + + return this; + } + public ReadOptions build() { return new ReadOptions(this); } } + + /** + * Allow to customize read column types. It can work in three ways: + * + *

    + *
  • If no information is provided column types are autodetected + *
  • A complete list of columns can be provided using {@link + * ReadOptions.Builder#columnTypes(ColumnType[])} or {@link + * ReadOptions.Builder#completeColumnTypeByNameFunction(Function)}; autodetection is skipped + *
  • Provide values for some column names using {@link + * ReadOptions.Builder#columnType(String,ColumnType)} or {@link + * ReadOptions.Builder#columnTypeByNameFunction(Function)}. In this + * case provided columnTypes are used and the others are autodetected + *
+ */ + public static class ColumnTypeReadOptions { + final ColumnType[] columnTypesByIdx; + final Map columnTypesByNameMap; + final Function> columnTypesByNameFunction; + final Function completeColumnTypesByNameFunction; + + public static ColumnTypeReadOptions of(ColumnType[] allColumnTypes) { + return new ColumnTypeReadOptions(allColumnTypes, null, null, null); + } + + ColumnTypeReadOptions( + ColumnType[] columnTypesByIdx, + Map columnTypesByNameMap, + Function completeColumnTypesByNameFunction, + Function> columnTypesByNameFunction) { + this.columnTypesByIdx = columnTypesByIdx; + this.columnTypesByNameMap = columnTypesByNameMap; + this.columnTypesByNameFunction = columnTypesByNameFunction; + this.completeColumnTypesByNameFunction = completeColumnTypesByNameFunction; + } + + public Optional columnType(int columnNumber, String columnName) { + Optional columnType = Optional.empty(); + if (columnTypesByIdx != null && columnNumber < columnTypesByIdx.length) + columnType = Optional.ofNullable(columnTypesByIdx[columnNumber]); + if (!columnType.isPresent() && columnTypesByNameMap != null) + columnType = Optional.ofNullable(columnTypesByNameMap.get(columnName)); + if (!columnType.isPresent() && completeColumnTypesByNameFunction != null) + columnType = Optional.of(completeColumnTypesByNameFunction.apply(columnName)); + if (!columnType.isPresent() && columnTypesByNameFunction != null) + columnType = columnTypesByNameFunction.apply(columnName); + return columnType; + } + + public ColumnType[] columnTypes() { + return columnTypesByIdx; + } + + public boolean canCalculateColumnTypeForAllColumns() { + return hasColumnTypeForAllColumns() || completeColumnTypesByNameFunction != null; + } + + public boolean hasColumnTypeForAllColumns() { + return columnTypesByIdx != null && columnTypesByIdx.length > 0; + } + } } diff --git a/core/src/main/java/tech/tablesaw/io/TableBuildingUtils.java b/core/src/main/java/tech/tablesaw/io/TableBuildingUtils.java index 6bb500ac8..47584f753 100644 
--- a/core/src/main/java/tech/tablesaw/io/TableBuildingUtils.java +++ b/core/src/main/java/tech/tablesaw/io/TableBuildingUtils.java @@ -2,6 +2,7 @@ import java.util.Iterator; import java.util.List; +import java.util.Optional; import tech.tablesaw.api.ColumnType; import tech.tablesaw.api.Table; @@ -18,6 +19,15 @@ public static Table build( ColumnTypeDetector detector = new ColumnTypeDetector(options.columnTypesToDetect()); Iterator iterator = dataRows.iterator(); ColumnType[] types = detector.detectColumnTypes(iterator, options); + + // If there are columnTypes configured by the user use them + for (int i = 0; i < types.length; i++) { + boolean hasColumnName = i < columnNames.size(); + Optional configuredColumnType = + options.columnTypeReadOptions().columnType(i, hasColumnName ? columnNames.get(i) : null); + if (configuredColumnType.isPresent()) types[i] = configuredColumnType.get(); + } + for (int i = 0; i < columnNames.size(); i++) { table.addColumns(types[i].create(columnNames.get(i))); } diff --git a/core/src/main/java/tech/tablesaw/io/csv/CsvReadOptions.java b/core/src/main/java/tech/tablesaw/io/csv/CsvReadOptions.java index 0e6ce8cb2..f1683ab4b 100644 --- a/core/src/main/java/tech/tablesaw/io/csv/CsvReadOptions.java +++ b/core/src/main/java/tech/tablesaw/io/csv/CsvReadOptions.java @@ -22,16 +22,16 @@ import java.io.StringReader; import java.net.URL; import java.time.format.DateTimeFormatter; -import java.util.ArrayList; import java.util.List; import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; import tech.tablesaw.api.ColumnType; import tech.tablesaw.io.ReadOptions; import tech.tablesaw.io.Source; public class CsvReadOptions extends ReadOptions { - - private final ColumnType[] columnTypes; private final Character separator; private final Character quoteChar; private final Character escapeChar; @@ -43,7 +43,6 @@ public class CsvReadOptions extends ReadOptions { private 
CsvReadOptions(CsvReadOptions.Builder builder) { super(builder); - columnTypes = builder.columnTypes; separator = builder.separator; quoteChar = builder.quoteChar; escapeChar = builder.escapeChar; @@ -115,8 +114,10 @@ public static Builder builder(InputStreamReader reader) { return new Builder(reader); } + /** @deprecated Use {@link #columnTypeReadOptions()} */ + @Deprecated public ColumnType[] columnTypes() { - return columnTypes; + return columnTypeReadOptions.columnTypes(); } public Character separator() { @@ -161,7 +162,6 @@ public static class Builder extends ReadOptions.Builder { private Character quoteChar; private Character escapeChar; private String lineEnding; - private ColumnType[] columnTypes; private Integer maxNumberOfColumns = 10_000; private Character commentPrefix; private boolean lineSeparatorDetectionEnabled = true; @@ -191,8 +191,9 @@ protected Builder(InputStream stream) { super(stream); } + @Override public Builder columnTypes(ColumnType[] columnTypes) { - this.columnTypes = columnTypes; + super.columnTypes(columnTypes); return this; } @@ -272,15 +273,7 @@ public Builder allowDuplicateColumnNames(Boolean allow) { @Override public Builder columnTypesToDetect(List columnTypesToDetect) { - // Types need to be in certain order as more general types like string come last - // Otherwise everything will be parsed as a string - List orderedTypes = new ArrayList<>(); - for (ColumnType t : EXTENDED_TYPES) { - if (columnTypesToDetect.contains(t)) { - orderedTypes.add(t); - } - } - this.columnTypesToDetect = orderedTypes; + super.columnTypesToDetect(columnTypesToDetect); return this; } @@ -364,5 +357,31 @@ public Builder ignoreZeroDecimal(boolean ignoreZeroDecimal) { super.ignoreZeroDecimal(ignoreZeroDecimal); return this; } + + @Override + public Builder columnType(String columnName, ColumnType columnType) { + super.columnType(columnName, columnType); + return this; + } + + @Override + public Builder columnTypeByNameFunction( + Function> 
columnTypeFunction) { + super.columnTypeByNameFunction(columnTypeFunction); + return this; + } + + @Override + public Builder completeColumnTypeByNameFunction( + Function columnTypeFunction) { + super.completeColumnTypeByNameFunction(columnTypeFunction); + return this; + } + + @Override + public Builder columnTypes(Map columnTypeByName) { + super.columnTypes(columnTypeByName); + return this; + } } } diff --git a/core/src/main/java/tech/tablesaw/io/csv/CsvReader.java b/core/src/main/java/tech/tablesaw/io/csv/CsvReader.java index 7d47c533f..3b44591e2 100644 --- a/core/src/main/java/tech/tablesaw/io/csv/CsvReader.java +++ b/core/src/main/java/tech/tablesaw/io/csv/CsvReader.java @@ -27,6 +27,7 @@ import tech.tablesaw.api.Table; import tech.tablesaw.io.DataReader; import tech.tablesaw.io.FileReader; +import tech.tablesaw.io.ReadOptions; import tech.tablesaw.io.ReaderRegistry; import tech.tablesaw.io.Source; @@ -54,23 +55,31 @@ public CsvReader() { * Determines column types if not provided by the user Reads all input into memory unless File was * provided */ - private Pair getReaderAndColumnTypes(Source source, CsvReadOptions options) - throws IOException { - ColumnType[] types = options.columnTypes(); + private Pair getReaderAndColumnTypes( + Source source, CsvReadOptions options) throws IOException { + ReadOptions.ColumnTypeReadOptions columnTypeReadOptions = options.columnTypeReadOptions(); byte[] bytesCache = null; - if (types == null) { - Reader reader = source.createReader(bytesCache); + boolean need2ParseFile = + !columnTypeReadOptions.hasColumnTypeForAllColumns() + && (!options.header() || !columnTypeReadOptions.canCalculateColumnTypeForAllColumns()); + if (need2ParseFile) { + Reader reader = source.createReader(null); if (source.file() == null) { String s = CharStreams.toString(reader); bytesCache = source.getCharset() != null ? 
s.getBytes(source.getCharset()) : s.getBytes(); // create a new reader since we just exhausted the existing one reader = source.createReader(bytesCache); } - types = detectColumnTypes(reader, options); + ColumnType[] detectedColumnTypes = detectColumnTypes(reader, options); + // If no columns where returned from detectColumnTypes leave initial options (that's the case + // for only header present) + if (detectedColumnTypes.length > 0) { + columnTypeReadOptions = ReadOptions.ColumnTypeReadOptions.of(detectedColumnTypes); + } } - return Pair.create(source.createReader(bytesCache), types); + return Pair.create(source.createReader(bytesCache), columnTypeReadOptions); } public Table read(CsvReadOptions options) throws IOException { @@ -78,14 +87,16 @@ public Table read(CsvReadOptions options) throws IOException { } private Table read(CsvReadOptions options, boolean headerOnly) throws IOException { - Pair pair = getReaderAndColumnTypes(options.source(), options); + Pair pair = + getReaderAndColumnTypes(options.source(), options); Reader reader = pair.getKey(); - ColumnType[] types = pair.getValue(); + ReadOptions.ColumnTypeReadOptions columnTypeReadOptions = pair.getValue(); AbstractParser parser = csvParser(options); try { - return parseRows(options, headerOnly, reader, types, parser, options.sampleSize()); + return parseRows( + options, headerOnly, reader, columnTypeReadOptions, parser, options.sampleSize()); } finally { if (options.source().reader() == null) { // if we get a reader back from options it means the client opened it, so let the client @@ -133,12 +144,15 @@ public String printColumnTypes(CsvReadOptions options) throws IOException { */ protected ColumnType[] detectColumnTypes(Reader reader, CsvReadOptions options) { boolean header = options.header(); - int linesToSkip = header ? 
1 : 0; - CsvParser parser = csvParser(options); try { - return getColumnTypes(reader, options, linesToSkip, parser); + String[] columnNames = null; + if (header) { + parser.beginParsing(reader); + columnNames = getColumnNames(options, options.columnTypeReadOptions(), parser); + } + return getColumnTypes(reader, options, 0, parser, columnNames); } finally { parser.stopParsing(); // we don't close the reader since we didn't create it diff --git a/core/src/main/java/tech/tablesaw/io/fixed/FixedWidthReadOptions.java b/core/src/main/java/tech/tablesaw/io/fixed/FixedWidthReadOptions.java index c05b575a4..eb7ad1a9f 100644 --- a/core/src/main/java/tech/tablesaw/io/fixed/FixedWidthReadOptions.java +++ b/core/src/main/java/tech/tablesaw/io/fixed/FixedWidthReadOptions.java @@ -22,6 +22,9 @@ import java.net.URL; import java.time.format.DateTimeFormatter; import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; import tech.tablesaw.api.ColumnType; import tech.tablesaw.io.ReadOptions; import tech.tablesaw.io.Source; @@ -199,7 +202,7 @@ public Builder skipInvalidRows(boolean skipInvalidRows) { } public Builder columnTypes(ColumnType[] columnTypes) { - this.columnTypes = columnTypes; + super.columnTypes(columnTypes); return this; } @@ -300,5 +303,31 @@ public Builder ignoreZeroDecimal(boolean ignoreZeroDecimal) { super.ignoreZeroDecimal(ignoreZeroDecimal); return this; } + + @Override + public Builder columnType(String columnName, ColumnType columnType) { + super.columnType(columnName, columnType); + return this; + } + + @Override + public Builder columnTypeByNameFunction( + Function> columnTypeFunction) { + super.columnTypeByNameFunction(columnTypeFunction); + return this; + } + + @Override + public Builder completeColumnTypeByNameFunction( + Function columnTypeFunction) { + super.completeColumnTypeByNameFunction(columnTypeFunction); + return this; + } + + @Override + public Builder columnTypes(Map columnTypeByName) { + 
super.columnTypes(columnTypeByName); + return this; + } } } diff --git a/core/src/main/java/tech/tablesaw/io/fixed/FixedWidthReader.java b/core/src/main/java/tech/tablesaw/io/fixed/FixedWidthReader.java index 6ff6a490b..916f38eb7 100644 --- a/core/src/main/java/tech/tablesaw/io/fixed/FixedWidthReader.java +++ b/core/src/main/java/tech/tablesaw/io/fixed/FixedWidthReader.java @@ -16,17 +16,21 @@ import com.google.common.io.CharStreams; import com.univocity.parsers.common.AbstractParser; +import com.univocity.parsers.common.NormalizedString; import com.univocity.parsers.fixed.FixedWidthFormat; import com.univocity.parsers.fixed.FixedWidthParser; import com.univocity.parsers.fixed.FixedWidthParserSettings; import java.io.IOException; import java.io.Reader; +import java.util.Arrays; +import java.util.Optional; import javax.annotation.concurrent.Immutable; import org.apache.commons.math3.util.Pair; import tech.tablesaw.api.ColumnType; import tech.tablesaw.api.Table; import tech.tablesaw.io.DataReader; import tech.tablesaw.io.FileReader; +import tech.tablesaw.io.ReadOptions; import tech.tablesaw.io.ReaderRegistry; import tech.tablesaw.io.Source; @@ -52,22 +56,29 @@ public FixedWidthReader() { * Determines column types if not provided by the user Reads all input into memory unless File was * provided */ - private Pair getReaderAndColumnTypes(FixedWidthReadOptions options) - throws IOException { - ColumnType[] types = options.columnTypes(); + private Pair getReaderAndColumnTypes( + FixedWidthReadOptions options) throws IOException { + ReadOptions.ColumnTypeReadOptions columnTypeReadOptions = options.columnTypeReadOptions(); byte[] bytesCache = null; - if (types == null) { + boolean hasColumnNames = + options.columnSpecs() != null + && options.columnSpecs().getFieldNames() != null + && options.columnSpecs().getFieldNames().length > 0; + if (!options.columnTypeReadOptions().hasColumnTypeForAllColumns() + && 
(!options.columnTypeReadOptions().canCalculateColumnTypeForAllColumns() + || !hasColumnNames)) { Reader reader = options.source().createReader(bytesCache); if (options.source().file() == null) { bytesCache = CharStreams.toString(reader).getBytes(); // create a new reader since we just exhausted the existing one reader = options.source().createReader(bytesCache); } - types = detectColumnTypes(reader, options); + columnTypeReadOptions = + ReadOptions.ColumnTypeReadOptions.of(detectColumnTypes(reader, options)); } - return Pair.create(options.source().createReader(bytesCache), types); + return Pair.create(options.source().createReader(bytesCache), columnTypeReadOptions); } public Table read(FixedWidthReadOptions options) throws IOException { @@ -75,14 +86,14 @@ public Table read(FixedWidthReadOptions options) throws IOException { } private Table read(FixedWidthReadOptions options, boolean headerOnly) throws IOException { - Pair pair = getReaderAndColumnTypes(options); + Pair pair = getReaderAndColumnTypes(options); Reader reader = pair.getKey(); - ColumnType[] types = pair.getValue(); + ReadOptions.ColumnTypeReadOptions columnTypeReadOptions = pair.getValue(); FixedWidthParser parser = fixedWidthParser(options); try { - return parseRows(options, headerOnly, reader, types, parser); + return parseRows(options, headerOnly, reader, columnTypeReadOptions, parser); } finally { if (options.source().reader() == null) { // if we get a reader back from options it means the client opened it, so let the client @@ -138,7 +149,16 @@ public ColumnType[] detectColumnTypes(Reader reader, FixedWidthReadOptions optio AbstractParser parser = fixedWidthParser(options); try { - return getColumnTypes(reader, options, linesToSkip, parser); + String[] columnNames = + Optional.ofNullable(options.columnSpecs()) + .flatMap(specs -> Optional.ofNullable(specs.getFieldNames())) + .map( + fieldNames -> + Arrays.stream(fieldNames) + .map(NormalizedString::toString) + .toArray(String[]::new)) + 
.orElse(null); + return getColumnTypes(reader, options, linesToSkip, parser, columnNames); } finally { parser.stopParsing(); // we don't close the reader since we didn't create it diff --git a/core/src/test/java/tech/tablesaw/io/csv/CsvReaderTest.java b/core/src/test/java/tech/tablesaw/io/csv/CsvReaderTest.java index c126144c0..390cf8f04 100644 --- a/core/src/test/java/tech/tablesaw/io/csv/CsvReaderTest.java +++ b/core/src/test/java/tech/tablesaw/io/csv/CsvReaderTest.java @@ -20,6 +20,7 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; import static tech.tablesaw.api.ColumnType.*; import com.univocity.parsers.common.TextParsingException; @@ -40,6 +41,7 @@ import java.util.Collections; import java.util.List; import java.util.Locale; +import java.util.Optional; import java.util.Set; import java.util.TreeSet; import org.junit.jupiter.api.Test; @@ -826,4 +828,103 @@ public void preserveQuote() throws IOException { Table out = Table.read().csv(new StringReader(string)); assertEquals(table.get(0, 0), out.get(0, 0)); } + + @Test + public void testCustomizedColumnTypesMixedWithDetection() throws IOException { + Reader reader = new FileReader("../data/bus_stop_test.csv"); + CsvReadOptions options = + CsvReadOptions.builder(reader) + .header(true) + .separator(',') + .locale(Locale.getDefault()) + .minimizeColumnSizes() + .columnType("stop_id", STRING) + .columnType("stop_name", STRING) + .columnTypeByNameFunction( + columnName -> + "stop_lon".equals(columnName) ? 
Optional.of(DOUBLE) : Optional.empty()) + .build(); + + ColumnType[] columnTypes = new CsvReader().read(options).columnTypes(); + + ColumnType[] expectedTypes = Arrays.copyOf(bus_types, bus_types.length); + expectedTypes[0] = STRING; // stop_id + expectedTypes[1] = STRING; // stop_name + expectedTypes[4] = DOUBLE; // stop_lon + assertArrayEquals(expectedTypes, columnTypes); + } + + @Test + public void testCustomizedColumnTypeAllCustomized() throws IOException { + Reader reader = new FileReader("../data/bus_stop_test.csv"); + CsvReadOptions options = + CsvReadOptions.builder(reader) + .header(true) + .separator(',') + .locale(Locale.getDefault()) + .minimizeColumnSizes() + .completeColumnTypeByNameFunction(columnName -> STRING) + .build(); + + ColumnType[] columnTypes = new CsvReader().read(options).columnTypes(); + + assertTrue(Arrays.stream(columnTypes).allMatch(columnType -> columnType.equals(STRING))); + } + + @Test + public void testColumnsArePreservedWithNoDataIfCustomizedTypesAreProvided() throws IOException { + Reader reader = new FileReader("../data/bus_stop_test_no_data.csv"); + CsvReadOptions options = + CsvReadOptions.builder(reader) + .header(true) + .separator(',') + .locale(Locale.getDefault()) + .minimizeColumnSizes() + .columnType("stop_id", SHORT) + .columnType("stop_name", STRING) + .columnType("stop_desc", STRING) + .columnType("stop_lat", FLOAT) + .columnType("stop_lon", FLOAT) + .build(); + + ColumnType[] columnTypes = new CsvReader().read(options).columnTypes(); + + assertArrayEquals(bus_types, columnTypes); + } + + @Test + public void testColumnsArePreservedWithNoDataIfCustomizedTypesAreProvided2() throws IOException { + Reader reader = new FileReader("../data/bus_stop_test_no_data.csv"); + CsvReadOptions options = + CsvReadOptions.builder(reader) + .header(true) + .separator(',') + .locale(Locale.getDefault()) + .minimizeColumnSizes() + .columnTypes(new ColumnType[] {SHORT, STRING, STRING, FLOAT, FLOAT}) + .build(); + + ColumnType[] 
columnTypes = new CsvReader().read(options).columnTypes(); + + assertArrayEquals(bus_types, columnTypes); + } + + @Test + public void testColumnsArePreservedWithNoDataIfCustomizedTypesAreProvidedPartially() + throws IOException { + Reader reader = new FileReader("../data/bus_stop_test_no_data.csv"); + CsvReadOptions options = + CsvReadOptions.builder(reader) + .header(true) + .separator(',') + .locale(Locale.getDefault()) + .minimizeColumnSizes() + .columnType("stop_id", SHORT) + .columnType("stop_name", STRING) + .build(); + + ColumnType[] columnTypes = new CsvReader().read(options).columnTypes(); + + assertArrayEquals(new ColumnType[] {SHORT, STRING}, columnTypes); + } } diff --git a/core/src/test/java/tech/tablesaw/io/fixed/FixedWidthReaderTest.java b/core/src/test/java/tech/tablesaw/io/fixed/FixedWidthReaderTest.java index a723d2190..2c8a623c6 100644 --- a/core/src/test/java/tech/tablesaw/io/fixed/FixedWidthReaderTest.java +++ b/core/src/test/java/tech/tablesaw/io/fixed/FixedWidthReaderTest.java @@ -16,6 +16,7 @@ import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; import static tech.tablesaw.api.ColumnType.FLOAT; import static tech.tablesaw.api.ColumnType.SHORT; import static tech.tablesaw.api.ColumnType.SKIP; @@ -25,8 +26,10 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileReader; +import java.io.IOException; import java.io.InputStream; import java.io.Reader; +import java.util.Arrays; import java.util.Locale; import org.junit.jupiter.api.Test; import tech.tablesaw.api.ColumnType; @@ -53,13 +56,13 @@ public void testWithCarsData() throws Exception { .systemLineEnding() .build()); - String[] expected = new String[]{"Year", "Make", "Model", "Description", "Price"}; + String[] expected = new String[] {"Year", "Make", "Model", "Description", "Price"}; assertArrayEquals(expected, 
table.columnNames().toArray()); table = table.sortDescendingOn("Year"); table.removeColumns("Description"); - expected = new String[]{"Year", "Make", "Model", "Price"}; + expected = new String[] {"Year", "Make", "Model", "Price"}; assertArrayEquals(expected, table.columnNames().toArray()); } @@ -79,7 +82,7 @@ public void testWithColumnSKIP() throws Exception { assertEquals(4, table.columnCount()); - String[] expected = new String[]{"Year", "Make", "Model", "Price"}; + String[] expected = new String[] {"Year", "Make", "Model", "Price"}; assertArrayEquals(expected, table.columnNames().toArray()); } @@ -100,7 +103,7 @@ public void testWithColumnSKIPWithoutHeader() throws Exception { assertEquals(4, table.columnCount()); - String[] expected = new String[]{"C0", "C1", "C2", "C4"}; + String[] expected = new String[] {"C0", "C1", "C2", "C4"}; assertArrayEquals(expected, table.columnNames().toArray()); } @@ -162,13 +165,55 @@ public void testWithSkipTrailingCharsUntilNewline() throws Exception { .skipTrailingCharsUntilNewline(true) .build()); - String[] expected = new String[]{"Year", "Make", "Model", "Description", "Price"}; + String[] expected = new String[] {"Year", "Make", "Model", "Description", "Price"}; assertArrayEquals(expected, table.columnNames().toArray()); table = table.sortDescendingOn("Year"); table.removeColumns("Price"); - expected = new String[]{"Year", "Make", "Model", "Description"}; + expected = new String[] {"Year", "Make", "Model", "Description"}; assertArrayEquals(expected, table.columnNames().toArray()); } + + @Test + public void testCustomizedColumnTypesMixedWithDetection() throws Exception { + InputStream stream = new FileInputStream(new File("../data/fixed_width_cars_test.txt")); + FixedWidthReadOptions options = + FixedWidthReadOptions.builder(stream) + .header(true) + .columnSpecs(car_fields_specs) + .padding('_') + .systemLineEnding() + .sample(false) + .locale(Locale.getDefault()) + .minimizeColumnSizes() + .columnType("Year", STRING) + 
.build(); + + ColumnType[] columnTypes = new FixedWidthReader().read(options).columnTypes(); + + ColumnType[] expectedTypes = Arrays.copyOf(car_types, car_types.length); + car_types[0] = STRING; // Year + assertArrayEquals(expectedTypes, columnTypes); + } + + @Test + public void testCustomizedColumnTypeAllCustomized() throws IOException { + InputStream stream = new FileInputStream(new File("../data/fixed_width_cars_test.txt")); + FixedWidthReadOptions options = + FixedWidthReadOptions.builder(stream) + .header(true) + .columnSpecs(car_fields_specs) + .padding('_') + .systemLineEnding() + .sample(false) + .locale(Locale.getDefault()) + .minimizeColumnSizes() + .completeColumnTypeByNameFunction(columnName -> STRING) + .build(); + + ColumnType[] columnTypes = new FixedWidthReader().read(options).columnTypes(); + + assertTrue(Arrays.stream(columnTypes).allMatch(columnType -> columnType.equals(STRING))); + } } diff --git a/data/bus_stop_test_no_data.csv b/data/bus_stop_test_no_data.csv new file mode 100644 index 000000000..db9997f80 --- /dev/null +++ b/data/bus_stop_test_no_data.csv @@ -0,0 +1 @@ +stop_id,stop_name,stop_desc,stop_lat,stop_lon diff --git a/data/columns.xlsx b/data/columns.xlsx index fb5f6919784edf40ff69f5f2f3be77008b51da81..a12f4f062d17842721d1cdfbf3e0251e444e355e 100644 GIT binary patch literal 11293 zcmeG?by!u)w}2pB2auAKICO`kfFPaHAYF$zl%&J~q$H#f6p%)`rAu16TR<8K51sGm zy*{s8-}~MlufFg8_V?|Xwb#sAd-m)VGqaYm+$}_WxZ7|jaBy%`a4F0-U~70dI2
Y0(tpC5RfA9}e?^I#OiUF1vJQpN_=?5%VM7Wvy917; zv}yb3=XD)DEym_~bV7E@UZL|^L? zu1N|2_0_FVd%6`Q2!MmgIB3?slZn)YlJ_buS4Ean{i1TrgxIA$8);_Sa5sEzDGl`E zM$1Q1bu_@=t=*QX7^t}eKdOttP3(W`VsvNmWowta0t=@-u43)lMIM;2#XS=hYtP4b z7uUAjNl6?hJ#gfu3(t9XRQB}}o`h!;P?>CAXwG7U4JKaK*SFx5f6>uOb@qD)uoiik z3Zlbw)WFfq#)*yfTlp{D{0A%KUp~DcT0!X#-Mk3uIv$&Qi6buODkfb=_1N22W){0L zJcEXGu8ICG&SMh)TaSDiyf1oY=7hpGyQq$qxC>qY@CENxxfTW{oH;y2W~6hBmvSgr z{DA8+b~twQ>XEEF(4`@gG5>w;Te+|Y}m}W*|52S9j%SQVC!#AEc3C7(hNI}cWTA8;i)iqpcbbuUlS#UZ*YdUBPlPoq&^FL3)L^2hYV~C+{14J*}%7aP=v)FXk3QQ=)%06hk-J=@ft6X6b)}+RIq9A!_FDU_sI_Sc4=*d_E^WD@Z@N8xlBu*mq@R`ZRZN_wDd&s_U1Wgq^BqW8&CKZ zeq#C-8W)=b<{8uz_}z&okjejuvARM z5Un1B;)v)u8?f>VAh(iBweUmnGj!U)gv(hv;(IY?!XGOv$L=0AaWGDOq!c~H&Zuhj z+9JQY^*gbRMvmP^gbB1Ltd#@~1s*20f0*~b3hxh74-Z?0!TSEQuY%}a&Zl& z{zHupoL@&< zn988}6}Xt0SR?JW_x9ZjZnl)Oc|(hI3u$s~LbFa26t$dVm?RRMYwVBEHVe+em{ty= zauA;4&wIW5(qzMWDnrCr!7(Nqdw76{Qgj4ANEIR)TTL1IHnRWPy6zo_TUhXNC~5`$ z4(Hxr2hu}GBb~;cE`1d4XS4IZq@%k7;xXr1J^Ov}L}fh0jju&-OMJlV%p>^BuN_ch zfG1^=pNhfj;&;C?p4G7$1jO)QT$t?qI8e?3zHaF%pIj4#{rbO~J_e-7F=3d^6G4H8 zBZBq#)ATu6nwdE}v3;*NzS+QnIbEfD-NYUH=hCNG@!*X1q7lyr&5VK=Q`)qu73Il# zeA)2UHc6AhK?Ct{%EVQrVeyxPTy1Ds!%_&N%VY~LQ%ve-*s!I>dT|b*10q1 zXEVpgoiW`WE;FpjhbpcM^J9;S>c+}8wj%SByU)DGcr}w1d)sLgb2znmp>_LJt^1q0 zAm@n*sJ7>P-#pt)owu&(7;92v(e;^&chkJOZhvFeY?*e0r_c3Vf$sbSAOYH5e{HMg zETq#Pu`oOPG=J4mrTSd0tIy`ZO8q+W&|sRy-08ZUmSwG6y?UqUJVVF5On+@Hc^5C0 zyS!6&p!zzwCqPKMl47+D&YP9MXMyqgS1l%Q)_89V4tHkW?V_>$^V6}76VZ{N>&Etq}_72YH zhFN)EU7drRrwV2?=ay2T`Bzn}Wg@Oc(_5>T={}AY{GluRSE+T8qE*o3nZ~N3L zbGuVFr8yaio`_;^LGVO5$!=}G#BQy4xt&VFlctDx=%TyM^3*c{vI()X_qr!TCM|Pk zLe8@;c@iZXs$h!R^(gNKxjCj})-XR^pX>&oD!r-az>lUTiLETwGVb5p^z zmFXN|-WIs0z8)!{ucQD^ZLLZk0C+)8zLcuO?GcS|6m7;(Ha z+RF_^O5NBNBQf1UXF|!s6>f+1l9OPEt9r><@cxud+tH9yCnKN-MR*HgJ_KCr->e*~%*o~ZBc1aIzJ3opFpezGE}KYBRL${Cl#RKq7E z5t%@o@^<)Y@1$jKTgN77&9L~*u4q!=;(g%W8M?y99OQxW7Y!O4K(f!*Y%`sySOe{; zFsbJgLH55~t~#`PCX|_R#`fH*1aLtqntUnx`V&@%g&^Hwn3l<_gAI}|;gZ5|O_Ohn 
zQ%s%|jPHTYvaog!KOPuY2CDV-5_opUXt{NlYkkToC25$;OQMKO#b16DN#Fj2^Xlzt zYCRe%4>Z@O@4g4YLj%b%?u?ZQxQr{F@?4*AhE0Q*kh2N)^ec8B+*P*L3);jcL}-HH z72uDqC59%sg%b!T2E6$Gwfv3IfKE$hRZ5L*VO&HLG??}ig6T1)6YH{-gL1GgE zRMYSZ?W4hWBm$J4-RE}&8Z?L9=g)$V66y3(%;*Y#Rq0zXV0eY``@-nz?&`pAisD8q zxGQ)^G(4hgc0VIDE?5;{gr)|o0*z#^v4pdHf>VZRR0pvkbPj{~2c0Kk%3APA9~Q&8 zY%nbWrNM>(RPK3zTqaEw0E|Nf(L#)6(sTg81jJ2Rh@VWFF#z}q5kv<` zmq`NwfXRrPbdXk=G)DmN4I+phvLut{0RW~VZqh>#WYhcrz_*AXAcR^rEffIEK->gE z#AVZ>0l+Lo5CgXOg60u04zXcWP%LK zrd0ueMTqa1AYWzE8Uer(L`LSrHoU#=hv8Y?@q~@t2_$lv9e}*jhal#tU%(F(Nk1Am z2CFaBOhO=y8>zo^`Oyv|giY6|7WEYz8c)1*IIXJ5x2Io0(a_($7Ssg#bf}o`rq-w;j}@S^*?4lud9! z%;Mqyr%2Jx|BJE6hD+v^0(tIab)=7K#ge>GHBkXE|KpIxqgr7kFH}r8K+IC%?YX5y zJa;lW{&`5n@b`w0!W?*~Foxxci`N2+tLR)+pL*Rp(eOtT`V-)$H#XLd zw&%yk8Rx5w`VDWc>x9Y9-9`&nzvwRs#3D`WQokdZ!&L4;swIf z7b#XDI)G4sw1CxBW82$1!_iyjx0Yfa3S&{MVJ-|3YWi@l&Z$w2yk@&>(Ox(PVt-{{ z*&n_>bb?x~daB}*1LB=(4=C~uidjXy-&3N;4TgEJr}iRgpM|Klr%%MdW8k^R6UMex z0||*$_86C*0K0B|=+gpE?kaf@%hc>TM2k(@iswaNnGqau`A_P}2;;Dqj)_^3lFnmt zi-)>Zo43}{+iV=~uN||pd6=poB!ZTCPnaBNDeFm7LWAJ4r{DX#p-_s!)mat`Pk+3f zpX_LhZc08=I&taX^LeT8WR0aCItJoN5!}Z$U5eB>(e~l2B^Pi*B))k6oT9L3Vk^Y| z(aRPkos&Li5#ky*L|Wz**EhrqHa+V+G(A%ZBYU@zHYegT5`rCzBF0pt?3_yBeQ$H7 ztM24=`bp9FPV3>t?>u~W`Vxt@vk|fE9DUEe)q=riq%vu*#If8;UToF;lP7)_e7hQ9 zR+#PBy$l0dICD_f zonWu*Ky>Lx{y~cPXsTL>+roo=FlZV`83O89)O);4nZ&u?U2!)6addGA%a}Vh)HmfE zA24pr8Un=vjDYqKQD>*}OA=j65;ykZ9>;|~W%Si4M7zrpX*og>83ys-qzntxpd}=h(cD7rohojM$0Vm<;(anE7#XP1hZlo~I_o||m>w)VT+J*^}buZGK75~4@gVs`#iM6GqSe8{@);Li`7 zHBCt*xs|K7$5D{ z5Ff>J=sK7vJM-ne@Ma|n6F}7kf950RU4@Elf3W$;E&>UA<>_$JT1xgxFgbmym>N?-x4di4qE2b~7bM z+m54IN1=oWOn5Q(-19vjTF|A;oEps=jcwjmnPb`~`gARRj;+^NhXj2O**ZvJuqqnz ziKVpP;3E7w&1=88V({A&H0HFlVO*Fx+rWb2n186Vle4>xnbWspvw97M$Pt(#-`l5P zUMN>AQ?SaX3AW<{0195ZMDg1@;N?{AJuEqgIi6V8#oad)c!34wc<|=XKPHMy6!y1Oz&z7`w09*`R9!s#`~Xb9Tf|D3q~% zd*e0cV!J#-&t|)yk==IKni>eQ1zE+U#X%R?v=7(6z5ngELo0icn=) zNHerz+)7BQbcY9|G z(mRRw;sGc{F&Si~i9~}Nc$6L{;k514Bw(-QO?Qh6ypU~B!}=_gq}=qi7Gc@hPo=mg 
zu82}eT{4r#(@iA(iwo6CtB(BFSPV;d9qXqE;Z)wKQ| zNZJ|3Xy_NXKi3Fvh(s`tw{av~=4rAn5*jmvoavO9V{Oa0o?D96z@UVbPxb+PV!pMY z$hTKF-Y+(`V%5GO#l-mRE4Bf~E%X{w@!7MYb7j$HyRE3&!6p$8wh9Vy%C?fuEqZdF zuyK<}Lp$r|8i?C2#N*BFNuRd|$V09H_KhU9cpGy4F{+MbCD7tz>+oy)v#0tWTA20v{km!8{M^g*kh(vEYIF#o>{!K(;YEQy?}=)Y zF*XH~mFhw(HejOs26LhUTTGgCx*N|M z=HeJ?Vh5!ZH%I_Y=vjVOjMAM5IlA`(mQR%}ZIGf2&yp#-#84g2l3)5D|L*h|$sWso z4yH5NFh`01r|wu9IhvWOIXha}S$tO^o0lz$w>U5!?n+-s#0vJs-x3EVlQUAW&VITz z8@?eWyG?tfaj+{xi>TT?G(h5MezB8p>eR{@VpTEIXNWocfn&&_Vi!E%8aW<}AlW6) zKg|_7_k4zwEj`;xMoKWBRjI$5+XQ}U&u^5MLJ!@NbZe+v6*v83uufSWMIf>!WfTuy zapsrZ@uaxnwa{||ochd9Ji+7U=1bdeZtt@0S#UUwVJVGf^m*?}e?EL!>a^1Q8drzl z`bugr25cH;VK>en(_vy{ zV5+=$5)!1s7pHGQb_81yQgP zlI!kBpU?}yWuPCxX;7T%c=DA<#nlEk#??_X{?P^$ZZr;8Ma&b9<1;qptkOyVm@7rB zF4aSB8M|vm^AZ+1{d;ubJX5iZ74~mWVK*);LS+gzQFa8|JF%I79nHRd5iAGcUx6vu zpSFpTvuj$k|9+VSyyOEz8enjqOisxnM5m>Sn01Mpv%lJd?#G-Rl|a+m9+bxRNaFdJnI~s##&Y`dagxT+(X)g4`&Gn~>vdI!{RnHiNyYuy(xvA6r-nn; zJOod(bu;GZJ0Gc8^5}0?B1IlgO^$v-t@N;7rxmA|M$@Ct5Pafnd7n;fM>FD!Z0PQT z$>rEG|Id{69h;pU?aJrz{IAR|;At4~j|RZtV-Zl?nv^XtM>o5&ju&b~Nx0_U(9#w2 z+-W%6B=Vr}iP+M$??BF{TW!I@O9iA4$p>D4#pD~Hzg)RoZ&mUD@$JJC8Qc>BZ$7<+ zn*aKYVM$Gb&B>$tnC=)efJLL2>l*FSUo^65`Xku9y_33WB{OGi=9=eck`_Dy3(V#H zK9%yvANx`NCb3dk?pFrC=6U@f{8nee)Z#B$UpEPF=Dz$S?Y{l*;ou*cFgJ;R%|Q4` z3J2$l`knYc$Va%z=Vlc8C#MhS|2~Mn#iVa?xw*0a$%PN*Zefw_o7?Q03~nyve=@jD z@twiXh5b#^n=6^0q~g?nkltL^+~jc6`Tyjga_>8bUzP$lsekpfKWX9MsA=Hf{^oaY llK*N$|4JT2_ZRX%Evm8{5==I}B^08=HNx_yWEj3Z{TovrwC(@^ literal 8041 zcmd^E2UJsCwx$z`5NV2vfb<>+O(Gpcfgl0`DiFX>rGxY$(yKryLX1=ar3pchA_Nen z1`z2#MG=tR1ED_9nUV3$|Ie&>GqYx`x099JyVpJ6Ip@23@3Z$g+8V?p3bLT9*GJ{FC7Kmno z%i(Z;AVwCH1!S9aPjy6KL(zO2lI81=W_@>5^gauZ};P++gS>thS% zRM!*$X+H1Xq;r3nYamV|o?JhG4g^<7P;M!rF6>CN2Q!)PkZxH{6uuxYzA3A{NBBPA z5jc`4Ofs4O%{<@GHKqM46w(LhHb+}@>e~8XEu#z{2w844305h4w}h2)o75pJu6is5 zR$3FY1h%)lk=~vhE#+^Oe9bFhS86g=)PcTK3Dedf2aucF1d$RF5RelS5NQ8BDm@%s z?z>uBSh%^0eE<7F<>-jR-4D!U-8|nQfLftT&RB40PFBq1dxa=6Xs-#F&VoK(B`}9} 
zwl^iqT9v%%_ELy_{GJugS}wBRWYA}N`vz-?Fzr~E!F==D4xElA4-tgl99r{r-k*lA z_12sC_41IwacOV{*AmGA_}>^H53P0gijj^KS=UoB2}`-yqg4EX53jK<$)@kPC<9NZ_igoAvg2Jisj_ymu#IY?i8n-K{lbM`s(r)0od4S)~@Lq z!c8^eY8k|7<4*nh+8Vs&$W*60gS?3F86?bOvU+)XzxC_r+)i`dW5=a1=aK1V+_uOV z%-Em*;A~mU?hrgSd)0b_RT+QiP1&0beCC#9)iu)?CIP8go^7kHx&c3Ax-2{P0N*sX zv2BU7Jec%d@9m^5$nKrY-ktG)?fHlvzVct$Y*-xGn3-Bj86hv}y20Js>X#k%y2L&3 z;FXNDPvw|kNnN_gSna4?zy)sdnf0iln)c4h_Vz#X*zHF_*XphxEP_@+ z!6FrWyOL9tl2cou`djvMen@ACJk0hZ4m#`@*1Dy;h3EDg&l#7B=pDBa!$IF0bhtk8 zr<|#D%wtWi_UZj?s@kWf_Nl#P-iLyzu%N!;Rk?!-)SEoVjO^X&@CUO|4@P>%nf5L2 zz8+)o*+8g>svC5W8zV^I@V>I(H9?WmD_A z^kbGA5o0E@a>HicG9mbe%ER@Fxi1^A!@b#>$?T}EoSl7R=j!%LN!_$Q$kH@KH%fX)>O~lWBafXc|J{L zksJQCCMjKNPEw0%9!)m1AJ(m_1^hF_3+D}`UM=BzKT&RSh<~%P-d+p)7St#bV0wjj zZw0${a|?6(p)nhXnj|?8BMuxSBbrn&WdU>$2l}yrfFw+h zFe<=701=0xDHouFBoM_0VjxL=gwX>IQWD)$G!+7LkOsbG1Dz#d3c|pFgVaRXil!2P z4zj>eHqZr<nY7FS0 z3iM+Kfk>D_G4FtbXNfqJO)UT&)PX4WGO&t!XbToN#C!>&47UnjppO4L%KaVUdVB%;_Hb4jVVfOb1amxB6$5 zd$Hv8Qw_^r+mXnN(j4`R4a+_&=UMl#8b`9F^Fcbbbqlr61V7N}EE>}W9D zJ+uQGH^ls}SNg?}!r%=Cw^uBf>daB3*cO93E6j`WTu2D zn_2NLP=}y0Ux69iL;sUX-&<<1{+@vc8zC%y79UCshvUI(%`AQo)L>)r|H0vuf%V)& z+pyY0%={2#3#LgU=e}~eKm16u_qesF@5vQ*R#kl4d0e8eiw68qHxb~xe^%$)e zsAEu>rC@RQ(D&HxLFRK1<$G5D^-A9v_3`SDTg4ll0)AWS<8=wBOuuHbb#*W;?sx zPUZfPbVn1cB)WGJ+4seJ{V}rB-0SUFog3H(e7I}3Y3ck{ZriJBsNzbhrpLaKU`uj7cA>`oL9&3xSF&Jf(B9e zO-py5ZA5FQ%j?^nOvn+knS!CP=1pW!UsN-`6~4K%=kS#^xd7&#Ex*rtT#D|2hTBzX z2nY;{{-_k?_)*oLC`Dl|7Ir5KQsLwKm@W;D$sg5q+CYS^*7=1oO`m#XgS5i<}+iWWh9d^|DEin`QoftHl(<~DJ zefv^N{9c=s^HX0qbLq{-Ckf+Jr~9svadt4aEWo*k4LkKc%-(K0?Yv@sddO}JcT$Sq2p$mw)2{I)5SHZMNZaa2%x1~j zrIp(agG)<0BTGr)#)tbLce}kl{rmiVB7yByDNR)vt^uh4Fy(=%@HK3IcU2iXMzDjU zF+pMvfA7$lA}}soHGG=7PArI^au!hO8P-nuk5dtG~zZfI)ZBPY#xy-%v@l!S!Jsb4&= zi6#>{U^(S)8hM9*1nT$$B-RNft3BFS8xWJRE-U0c66KrZMV+tv2=Xs{!R-_{OR!aY zq({x@uZF%cxpGRuEMH&r>Bxu(YSB1GLgAL{Ss$z2ih<_yt5(FmuSOmAGlJ4zdpbUZ z4ZAM)FHXE=TBgN2I(pQ2kuO(nXLENdFkn4mAvhPmy_fX 
zy1Tr}^{#aW1Sq4|vdnRhNn>y3J3&-h;}I@I<&D<1oD7p3`Xatp2a(vm(8>dUlEHx{GDcn9KV?J$+xFa}23b8202 z;WBQQ6ym9pq@)pz zSoC(vJGTmPi?Kf+WUwTl`lQg2TV$cj!(9V3(S~vRsxP zLrX1dM9b!>OL3rzyD7n?r*s)U*WnwhXdbQV)ntn^wO6*hmZ&|?JCDNz$+&`HArv=D zLrbea>463Gt~AgAfY49MT#2u{4_(m=MlBG|kCeG#FE_1J%xQRA+b&t=1ePTuCxsWb zG=yJ`r@QFOw@Sx4^YRGV8^qjFys8GH5QgBd$IIpP5KDigKa5S(K4|14jSi2CNjv@d z{j|fa1!IHFn-{zmXFpI2>F60Stc#UQRF8Ix_GhNz3@l=9F47n~5OI69>6WBqX)dKtJf9_;s|$ z^QFxL3`z(YaO`Q6)20%9N1I+%M-k`uJ&oaqr@302xmdt;-CT}05|8%}?c)@+h()PY zZ(vxm7HmR=fl8@vfx*>gPwCP+(D*fx4=(i)# zem0>C^eC%c+>Y=_I>NV`^hq%&b zC_6^>w#a<0ntk!n8ls58+i&7=gU^O}{ltBDNE-Hj4|Foe0>sN6#!?UX+zkzM34SV~JsEI7Br(hCw*UO|`C~T9-)n}s)$wIy!PwkbWDtjWqWmz!E|L~2abL;K%QRs))!_+|Mo~--jS8BlLNr>xpZvOF z@;Y83K2hMp>Hbl3)pqcOR*meU;?crS3E!%xxgcuQ>{7wWJe*!M>r^rcX)_g7M1!F+ zCQc2;tcITjZx(etN9TVka_LSnV2u2bN0~Sv5PfL^Rh%KUwFO7B!#?kZUcA_k?DVc_ z=G168Eu>*vMApp}8!1>S-%Z8%@J;Z6vciiH$(DsEh8g@7Uz4JIc+YAqrODZg7!H-a zm_QnJ%V!(sML(|;z5(j$XSC3Li@bMriY;T(Z$OHm(3I|y=88|#-Ie_YKk;(bkYHl5 zGsAiypGirzBoEnhnf%=E`nyJSWt|so8T|I{VnA;hB!Qc&q5;^tV2ZoagpGY)~4Y@mj18O=Gltr(f9)Q zjJZmg=Eu*1G5OY2PDZLD@JEE`xzNus{hv!9yA;Rv6w?omW+WmYI3)ce@1p%-PseMy zxufI#?;G<88b=%Rz|b4r5S6i@dny9q*9P^Hj=sYeF@yBxpVi<<@!#qYaVlAMS;c!# z9p#1&9mdb0#3zDWBzf#o6G#f3bcS@b>gvas&L;%76n!1O-O|cPG<4=h4eLyo+6L5u8S&Api9>ilB6OHIC6g%pYVWPII_;(;E-9ty8IaL!e!>zh zBkDoVAkXeAS$_F?ELYQPSe>p?EI$&M)R6|qbt`CFlWr&>A?zH}XPK9}QoI9O&I!e; z7kDnd<<>+XqAG_lhMMP$p`rWe_up1s+WXgQSueV3RC?fqd=6M6o+p;d=Su|mM&a)m zX>x*T!TT-1c=O1Su)OBtnJ}}3kIbS^VkqbG138n=8~M9iQKLs<$Rm2VBf1#0_7saL zbiRehYx7gBvCI=KQ5CC>hsHvqzHjYA+p>z(j{KSX54ZNu{dfmIlBkn!;(HoZl%Qza z1Oloay^ipqfii@Sdc;#GFMK4I9Qbiitdxr#I=q8JNlwTBmCl$%95(MAc3Y$O&v~x~ z@_94}+$CKOOWEZCrj}P$(_UnZeqwfa+}@;g;2n7AfhZNt>g_;|$Sxhr zNvG~Az-ifWarRSbhPnDz?PIG|tN7d1Qm2`lb;M6ntK5FcA@GIN9O+z}sDN0$5hYXa5L6F=zsQ>Lj%^9Tea z?LNvhEpa+dX3?3iM7ixwUCgLZW>?6wDQH|=)QO==LEzW@a|=_v%xO2w*aR|KTuiMQ zXB*BD4&Z8K38-$~5#lN1HEkw%@S0CVy7h^JlKdMCoq5nhxvL&uujF5}Vit3;?#<

z)@|21vWh*T1NJpW35UyU!D%?mB;Z;it>FUnUZ059=)gGo`W?=fY=NIlYAZfK!1J&3 z)=}tYGWygB0a3mWSqdeOB>itkJZ@h9rp1uhaQ{`qJuciebGe!{Vch8#6Yv{J)-JJI zItjMjgh}jKRyj1SiqjT%+5ons8i`Vqt3`SZRvTW4cWFeCa^|P*BcsCn>z@~syt9sZ z)KSNYi1=dj!PF6L+%EsszE(~YEOVBRet3ymg8Sp<3xe!V{IzSHoY|`|y3FDjC6-Ua z&g$1i*{yAXX!Mk1&Mm2BiE*~6GftZP9{T=4oHZam+N5XWh}ocMB()F4DVNz zlV!+p!Sz%6k7D*;qx{YC>sP>&+19a9{gh$gcfen3nO^};CMd^A*-ue9x;+}v|4Pw* zML4-69VaqBMThYR!e7#wUz?tcq{nO4Psuv-7drmDg8ka~WKcN{wm*fN`FrEv3BAAK vob=e^5cE?Fk6=#(q+c7JG`hd$UitjrbZ>19QnF)o%A-%(Q7EjsaD4YauI~;1 diff --git a/excel/src/main/java/tech/tablesaw/io/xlsx/XlsxReadOptions.java b/excel/src/main/java/tech/tablesaw/io/xlsx/XlsxReadOptions.java index 3097d7a41..1f97471e7 100644 --- a/excel/src/main/java/tech/tablesaw/io/xlsx/XlsxReadOptions.java +++ b/excel/src/main/java/tech/tablesaw/io/xlsx/XlsxReadOptions.java @@ -7,6 +7,10 @@ import java.net.URL; import java.time.format.DateTimeFormatter; import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; +import tech.tablesaw.api.ColumnType; import tech.tablesaw.io.ReadOptions; import tech.tablesaw.io.Source; @@ -158,5 +162,31 @@ public Builder sheetIndex(int sheetIndex) { this.sheetIndex = sheetIndex; return this; } + + @Override + public Builder columnType(String columnName, ColumnType columnType) { + super.columnType(columnName, columnType); + return this; + } + + @Override + public Builder columnTypeByNameFunction( + Function> columnTypeFunction) { + super.columnTypeByNameFunction(columnTypeFunction); + return this; + } + + @Override + public Builder completeColumnTypeByNameFunction( + Function columnTypeFunction) { + super.completeColumnTypeByNameFunction(columnTypeFunction); + return this; + } + + @Override + public Builder columnTypes(Map columnTypeByName) { + super.columnTypes(columnTypeByName); + return this; + } } } diff --git a/excel/src/main/java/tech/tablesaw/io/xlsx/XlsxReader.java 
b/excel/src/main/java/tech/tablesaw/io/xlsx/XlsxReader.java index e841a40e0..211ad5712 100644 --- a/excel/src/main/java/tech/tablesaw/io/xlsx/XlsxReader.java +++ b/excel/src/main/java/tech/tablesaw/io/xlsx/XlsxReader.java @@ -14,6 +14,19 @@ package tech.tablesaw.io.xlsx; +import static org.apache.poi.ss.usermodel.CellType.FORMULA; + +import java.io.*; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.List; +import javax.annotation.concurrent.Immutable; +import org.apache.poi.ss.format.CellDateFormatter; +import org.apache.poi.ss.format.CellGeneralFormatter; +import org.apache.poi.ss.format.CellNumberFormatter; import org.apache.poi.ss.usermodel.*; import org.apache.poi.ss.usermodel.Row.MissingCellPolicy; import org.apache.poi.xssf.usermodel.XSSFWorkbook; @@ -26,17 +39,6 @@ import tech.tablesaw.io.ReaderRegistry; import tech.tablesaw.io.Source; -import javax.annotation.concurrent.Immutable; -import java.io.*; -import java.time.LocalDateTime; -import java.time.ZoneId; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Date; -import java.util.List; - -import static org.apache.poi.ss.usermodel.CellType.FORMULA; - @Immutable public class XlsxReader implements DataReader { @@ -140,8 +142,9 @@ private Boolean isBlank(Cell cell) { return null; } - private ColumnType getColumnType(Cell cell) { - CellType cellType = cell.getCellType() == FORMULA ? cell.getCachedFormulaResultType() : cell.getCellType(); + private ColumnType calculateColumnTypeFromCell(Cell cell) { + CellType cellType = + cell.getCellType() == FORMULA ? 
cell.getCachedFormulaResultType() : cell.getCellType(); switch (cellType) { case STRING: return ColumnType.STRING; @@ -257,9 +260,10 @@ private Table createTable(Sheet sheet, TableRange tableArea, XlsxReadOptions opt Cell cell = row.getCell(colNum + tableArea.startColumn, MissingCellPolicy.RETURN_BLANK_AS_NULL); Column column = columns.get(colNum); + String columnName = headerNames.get(colNum); if (cell != null) { if (column == null) { - column = createColumn(headerNames.get(colNum), cell); + column = createColumn(colNum, columnName, cell, options); columns.set(colNum, column); while (column.size() < rowNum - tableArea.startRow) { column.appendMissing(); @@ -270,6 +274,17 @@ private Table createTable(Sheet sheet, TableRange tableArea, XlsxReadOptions opt column = altColumn; columns.set(colNum, column); } + } else { + boolean hasCustomizedType = + options.columnTypeReadOptions().columnType(colNum, columnName).isPresent(); + if (column == null && hasCustomizedType) { + ColumnType columnType = + options.columnTypeReadOptions().columnType(colNum, columnName).get(); + column = columnType.create(columnName).appendMissing(); + columns.set(colNum, column); + } else if (hasCustomizedType) { + column.appendMissing(); + } } if (column != null) { while (column.size() <= rowNum - tableArea.startRow) { @@ -285,7 +300,8 @@ private Table createTable(Sheet sheet, TableRange tableArea, XlsxReadOptions opt @SuppressWarnings("unchecked") private Column appendValue(Column column, Cell cell) { - CellType cellType = cell.getCellType() == FORMULA ? cell.getCachedFormulaResultType() : cell.getCellType(); + CellType cellType = + cell.getCellType() == FORMULA ? 
cell.getCachedFormulaResultType() : cell.getCellType(); switch (cellType) { case STRING: column.appendCell(cell.getRichStringCellValue().getString()); @@ -297,7 +313,20 @@ private Column appendValue(Column column, Cell cell) { // behavior LocalDateTime localDate = date.toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime(); - column.appendCell(localDate.toString()); + if (column.type() == ColumnType.STRING) { + // If column has String type try to honor it and leave the value as an string as similar + // as posible as seen in Excel + String dataFormatStyle = cell.getCellStyle().getDataFormatString(); + String val; + if ("general".equalsIgnoreCase(dataFormatStyle)) { + val = new CellGeneralFormatter().format(cell.getNumericCellValue()); + } else { + val = new CellDateFormatter(dataFormatStyle).format(cell.getDateCellValue()); + } + column.appendCell(val); + } else { + column.appendCell(localDate.toString()); + } return null; } else { double num = cell.getNumericCellValue(); @@ -332,6 +361,18 @@ private Column appendValue(Column column, Cell cell) { Column doubleColumn = (Column) column; doubleColumn.append(num); return null; + } else if (column.type() == ColumnType.STRING) { + // If column has String type try to honor it and leave the value as an string as similar + // as posible as seen in Excel + Column stringColumn = (Column) column; + String dataFormatStyle = cell.getCellStyle().getDataFormatString(); + String val; + if ("general".equalsIgnoreCase(dataFormatStyle)) { + val = new CellGeneralFormatter().format(cell.getNumericCellValue()); + } else { + val = new CellNumberFormatter(dataFormatStyle).format(cell.getNumericCellValue()); + } + stringColumn.append(val); } } break; @@ -340,6 +381,12 @@ private Column appendValue(Column column, Cell cell) { Column booleanColumn = (Column) column; booleanColumn.append(cell.getBooleanCellValue()); return null; + } else if (column.type() == ColumnType.STRING) { + // If column has String type try to honor it and leave 
the value as an string as similar + // as posible as seen in Excel + Column stringColumn = (Column) column; + String val = new CellGeneralFormatter().format(cell.getBooleanCellValue()); + stringColumn.append(val); } default: break; @@ -347,9 +394,14 @@ private Column appendValue(Column column, Cell cell) { return null; } - private Column createColumn(String name, Cell cell) { + private Column createColumn(int colNum, String name, Cell cell, XlsxReadOptions options) { Column column; - ColumnType columnType = getColumnType(cell); + + ColumnType columnType = + options + .columnTypeReadOptions() + .columnType(colNum, name) + .orElse(calculateColumnTypeFromCell(cell)); if (columnType == null) { columnType = ColumnType.STRING; } diff --git a/excel/src/test/java/tech/tablesaw/io/xlsx/XlsxReaderTest.java b/excel/src/test/java/tech/tablesaw/io/xlsx/XlsxReaderTest.java index 69f2c484e..e789b30c0 100644 --- a/excel/src/test/java/tech/tablesaw/io/xlsx/XlsxReaderTest.java +++ b/excel/src/test/java/tech/tablesaw/io/xlsx/XlsxReaderTest.java @@ -14,15 +14,28 @@ package tech.tablesaw.io.xlsx; -import org.junit.jupiter.api.Test; -import tech.tablesaw.api.Table; -import tech.tablesaw.columns.Column; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; +import static tech.tablesaw.api.ColumnType.BOOLEAN; +import static tech.tablesaw.api.ColumnType.DOUBLE; +import static tech.tablesaw.api.ColumnType.FLOAT; +import static tech.tablesaw.api.ColumnType.LOCAL_DATE_TIME; +import static tech.tablesaw.api.ColumnType.LONG; +import static tech.tablesaw.api.ColumnType.STRING; +import com.google.common.collect.Lists; import java.io.IOException; import java.time.LocalDateTime; +import java.util.Arrays; import java.util.List; - -import static 
org.junit.jupiter.api.Assertions.*; +import java.util.Optional; +import org.junit.jupiter.api.Test; +import tech.tablesaw.api.ColumnType; +import tech.tablesaw.api.Table; +import tech.tablesaw.columns.Column; public class XlsxReaderTest { @@ -84,7 +97,8 @@ public void testColumns() { "booleancol", "datecol", "formulacol"); - // stringcol shortcol intcol longcol doublecol booleancol datecol formulacol + // stringcol shortcol intcol longcol doublecol booleancol datecol + // formulacol // Hallvard 123 12345678 12345678900 12,34 TRUE 22/02/2019 20:54:09 135.34 // Marit 124 12345679 12345678901 13,35 FALSE 23/03/2020 21:55:10 137.35 assertColumnValues(table.stringColumn("stringcol"), "Hallvard", "Marit"); @@ -93,8 +107,11 @@ public void testColumns() { assertColumnValues(table.longColumn("longcol"), 12345678900L, 12345678901L); assertColumnValues(table.doubleColumn("doublecol"), 12.34, 13.35); assertColumnValues(table.booleanColumn("booleancol"), true, false); - assertColumnValues(table.dateTimeColumn("datecol"), LocalDateTime.of(2019, 2, 22, 20, 54, 9), LocalDateTime.of(2020, 3, 23, 21, 55, 10)); - assertColumnValues(table.doubleColumn("formulacol"), 135.34 , 137.35); + assertColumnValues( + table.dateTimeColumn("datecol"), + LocalDateTime.of(2019, 2, 22, 20, 54, 9), + LocalDateTime.of(2020, 3, 23, 21, 55, 10)); + assertColumnValues(table.doubleColumn("formulacol"), 135.34, 137.35); } @Test @@ -122,9 +139,9 @@ public void testColumnsWithMissingValues() { assertColumnValues(table.longColumn("longcol"), 12345678900L, null); assertColumnValues(table.doubleColumn("doublecol"), null, 13.35); assertColumnValues(table.booleanColumn("booleancol"), true, null); - assertColumnValues(table.dateTimeColumn("datecol"), LocalDateTime.of(2019, 2, 22, 20, 54, 9), null); - assertColumnValues(table.doubleColumn("formulacol"), null ,137.35); - + assertColumnValues( + table.dateTimeColumn("datecol"), LocalDateTime.of(2019, 2, 22, 20, 54, 9), null); + 
assertColumnValues(table.doubleColumn("formulacol"), null, 137.35); } @Test @@ -157,4 +174,80 @@ public void testSheetIndex() throws IOException { // expected } } + + @Test + public void testCustomizedColumnTypesMixedWithDetection() throws IOException { + Table table = + new XlsxReader() + .read( + XlsxReadOptions.builder("../data/columns.xlsx") + .columnType("shortcol", DOUBLE) + .columnType("intcol", LONG) + .columnTypeByNameFunction( + columnName -> + "formulacol".equals(columnName) ? Optional.of(FLOAT) : Optional.empty()) + .build()); + + ColumnType[] columnTypes = table.columnTypes(); + + assertArrayEquals( + columnTypes, + new ColumnType[] {STRING, DOUBLE, LONG, LONG, DOUBLE, BOOLEAN, LOCAL_DATE_TIME, FLOAT}); + } + + @Test + public void testCustomizedColumnTypeAllCustomized() throws IOException { + Table table = + new XlsxReader() + .read( + XlsxReadOptions.builder("../data/columns.xlsx") + .completeColumnTypeByNameFunction(columName -> STRING) + .build()); + + ColumnType[] columnTypes = table.columnTypes(); + + assertTrue(Arrays.stream(columnTypes).allMatch(columnType -> columnType.equals(STRING))); + } + + @Test + public void testCustomizedEmptyColumnsArePreserved() throws IOException { + Table table = + new XlsxReader() + .read( + XlsxReadOptions.builder("../data/columns.xlsx") + .completeColumnTypeByNameFunction(columName -> STRING) + .build()); + + assertEquals( + table.column("empty").type(), + STRING, + "Empty column must be preserved as it's type is specified"); + } + + @Test + public void testCustomizedColumnStringShouldTryToPreserveValuesFromOtherExcelTypes() + throws IOException { + Table table = + new XlsxReader() + .read( + XlsxReadOptions.builder("../data/columns.xlsx") + .completeColumnTypeByNameFunction(columName -> STRING) + .build()); + + System.out.println(table.print()); + + assertEquals(table.column("stringcol").asList(), Lists.newArrayList("Hallvard", "Marit")); + assertEquals(table.column("intcol").asList(), 
Lists.newArrayList("12345678", "12345679")); + // Not ideal, format viewed in excel is without E+10 notation + assertEquals( + table.column("longcol").asList(), Lists.newArrayList("1.23457E+10", "1.23457E+10")); + assertEquals(table.column("doublecol").asList(), Lists.newArrayList("12.34", "13.35")); + assertEquals(table.column("booleancol").asList(), Lists.newArrayList("TRUE", "FALSE")); + assertEquals(table.column("booleancol").asList(), Lists.newArrayList("TRUE", "FALSE")); + assertEquals( + table.column("datecol").asList(), + Lists.newArrayList("22/02/2019 20:54:09", "23/03/2020 21:55:10")); + assertEquals(table.column("formulacol").asList(), Lists.newArrayList("135.34", "137.35")); + assertEquals(table.column("empty").asList(), Lists.newArrayList("", "")); + } } diff --git a/html/src/main/java/tech/tablesaw/io/html/HtmlReadOptions.java b/html/src/main/java/tech/tablesaw/io/html/HtmlReadOptions.java index 38c26a410..218d76150 100644 --- a/html/src/main/java/tech/tablesaw/io/html/HtmlReadOptions.java +++ b/html/src/main/java/tech/tablesaw/io/html/HtmlReadOptions.java @@ -8,6 +8,10 @@ import java.net.URL; import java.time.format.DateTimeFormatter; import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; +import tech.tablesaw.api.ColumnType; import tech.tablesaw.io.ReadOptions; import tech.tablesaw.io.Source; @@ -185,5 +189,31 @@ public Builder tableIndex(int tableIndex) { this.tableIndex = tableIndex; return this; } + + @Override + public Builder columnType(String columnName, ColumnType columnType) { + super.columnType(columnName, columnType); + return this; + } + + @Override + public Builder columnTypeByNameFunction( + Function> columnTypeFunction) { + super.columnTypeByNameFunction(columnTypeFunction); + return this; + } + + @Override + public Builder completeColumnTypeByNameFunction( + Function columnTypeFunction) { + super.completeColumnTypeByNameFunction(columnTypeFunction); + return this; + } + + 
@Override + public Builder columnTypes(Map columnTypeByName) { + super.columnTypes(columnTypeByName); + return this; + } } } diff --git a/json/src/main/java/tech/tablesaw/io/json/JsonReadOptions.java b/json/src/main/java/tech/tablesaw/io/json/JsonReadOptions.java index 692527ee0..9858325a1 100644 --- a/json/src/main/java/tech/tablesaw/io/json/JsonReadOptions.java +++ b/json/src/main/java/tech/tablesaw/io/json/JsonReadOptions.java @@ -8,6 +8,10 @@ import java.net.URL; import java.time.format.DateTimeFormatter; import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; +import tech.tablesaw.api.ColumnType; import tech.tablesaw.io.ReadOptions; import tech.tablesaw.io.Source; @@ -185,5 +189,31 @@ public Builder path(String path) { this.path = path; return this; } + + @Override + public Builder columnType(String columnName, ColumnType columnType) { + super.columnType(columnName, columnType); + return this; + } + + @Override + public Builder columnTypeByNameFunction( + Function> columnTypeFunction) { + super.columnTypeByNameFunction(columnTypeFunction); + return this; + } + + @Override + public Builder completeColumnTypeByNameFunction( + Function columnTypeFunction) { + super.completeColumnTypeByNameFunction(columnTypeFunction); + return this; + } + + @Override + public Builder columnTypes(Map columnTypeByName) { + super.columnTypes(columnTypeByName); + return this; + } } } diff --git a/json/src/test/java/tech/tablesaw/io/json/JsonReaderTest.java b/json/src/test/java/tech/tablesaw/io/json/JsonReaderTest.java index 63645c63e..dd0d73ab1 100644 --- a/json/src/test/java/tech/tablesaw/io/json/JsonReaderTest.java +++ b/json/src/test/java/tech/tablesaw/io/json/JsonReaderTest.java @@ -14,8 +14,13 @@ package tech.tablesaw.io.json; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; +import static tech.tablesaw.api.ColumnType.DOUBLE; +import static 
tech.tablesaw.api.ColumnType.INSTANT; +import static tech.tablesaw.api.ColumnType.STRING; +import java.io.IOException; import org.junit.jupiter.api.Test; import tech.tablesaw.api.ColumnType; import tech.tablesaw.api.IntColumn; @@ -75,4 +80,33 @@ public void arrayOfRowsWithIncompleteIndexes() { assertEquals(expected.column("B").asList(), actual.column("B").asList()); assertEquals(expected.column("C").asList(), actual.column("C").asList()); } + + @Test + public void testCustomizedColumnTypesMixedWithDetection() throws IOException { + String json = + "[[\"Date\",\"Value\"],[\"2007-12-03T10:15:30.00Z\",-2.1448117025014],[\"2020-12-03T10:15:30.00Z\",-2.9763153817574],[\"2021-12-03T10:15:30.00Z\",-2.9545283436391]]"; + + ColumnType[] columnTypes = + new JsonReader() + .read(JsonReadOptions.builderFromString(json).columnType("Date", INSTANT).build()) + .columnTypes(); + + assertArrayEquals(columnTypes, new ColumnType[] {INSTANT, DOUBLE}); + } + + @Test + public void testCustomizedColumnTypeAllCustomized() throws IOException { + String json = + "[[\"Date\",\"Value\"],[\"2007-12-03T10:15:30.00Z\",-2.1448117025014],[\"2020-12-03T10:15:30.00Z\",-2.9763153817574],[\"2021-12-03T10:15:30.00Z\",-2.9545283436391]]"; + + ColumnType[] columnTypes = + new JsonReader() + .read( + JsonReadOptions.builderFromString(json) + .completeColumnTypeByNameFunction(columnName -> STRING) + .build()) + .columnTypes(); + + assertArrayEquals(columnTypes, new ColumnType[] {STRING, STRING}); + } }