Basic changes to File_Format (#3516)

Implements https://www.pivotaltracker.com/story/show/182308987
enso-org · Jun 8, 2022 · 2af970f · 2af970f
1 parent b1db359
commit 2af970f
Show file tree

Hide file tree

Showing 13 changed files with 169 additions and 81 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -134,6 +134,8 @@
 - [Added rank data, correlation and covariance statistics for `Vector`][3484]
 - [Implemented `Table.order_by` for the SQLite backend.][3502]
 - [Implemented `Table.order_by` for the PostgreSQL backend.][3514]
+- [Renamed `File_Format.Text` to `Plain_Text`, updated `File_Format.Delimited`
+  API and added builders for customizing less common settings.][3516]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -210,6 +212,7 @@
 [3484]: https://github.com/enso-org/enso/pull/3484
 [3502]: https://github.com/enso-org/enso/pull/3502
 [3514]: https://github.com/enso-org/enso/pull/3514
+[3516]: https://github.com/enso-org/enso/pull/3516
 
 #### Enso Compiler
 

diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Locale.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Locale.enso
@@ -414,6 +414,12 @@ type Locale
         if this.variant.is_nothing.not then b.append ["variant", this.variant]
         Json.from_pairs b.to_vector
 
+    ## Checks whether `other` is a `Locale` wrapping an equal Java locale.
+    == : Any -> Boolean
+    == other = case other of
+        Locale that_java_locale -> this.java_locale.equals that_java_locale
+        _ -> False
+
 ## PRIVATE
 
    Convert a java locale to an Enso locale.

diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Dialect/Postgres.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Dialect/Postgres.enso
@@ -235,7 +235,7 @@ make_order_descriptor internal_column sort_direction text_ordering =
                         IR.Order_Descriptor internal_column.expression sort_direction nulls_order=nulls collation=Nothing
                     True ->
                         IR.Order_Descriptor internal_column.expression sort_direction nulls_order=nulls collation="ucs_basic"
-                    Case_Insensitive locale -> case Locale.default.java_locale.equals locale.java_locale of
+                    Case_Insensitive locale -> case locale == Locale.default of
                         False ->
                             Error.throw (Unsupported_Database_Operation_Error "Case insensitive ordering with custom locale is currently not supported. You may need to materialize the Table to perform this operation.")
                         True ->

diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Dialect/Sqlite.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Dialect/Sqlite.enso
@@ -58,7 +58,7 @@ type Sqlite_Dialect
                         IR.Order_Descriptor internal_column.expression sort_direction collation=Nothing
                     True ->
                         IR.Order_Descriptor internal_column.expression sort_direction collation="BINARY"
-                    Case_Insensitive locale -> case Locale.default.java_locale.equals locale.java_locale of
+                    Case_Insensitive locale -> case locale == Locale.default of
                         False ->
                             Error.throw (Unsupported_Database_Operation_Error "Case insensitive ordering with custom locale is not supported by the SQLite backend. You may need to materialize the Table to perform this operation.")
                         True ->

diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso
@@ -154,3 +154,10 @@ type Leading_Zeros column:Text (datatype:(Integer|Number|Date|Time|Time_Of_Day|B
    a parse is attempted anyway. If mixed types are requested, the column is not
    parsed due to ambiguity.
 type Duplicate_Type_Selector column:Text ambiguous:Boolean
+
+## Indicates that the given file type is not supported by the `Auto` format.
+type Unsupported_File_Type filename
+
+Unsupported_File_Type.to_display_text : Text
+Unsupported_File_Type.to_display_text =
+    "The " + this.filename + " has a type that is not supported by the Auto format."
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso
@@ -94,10 +94,6 @@ read_from_reader format java_reader on_problems max_columns=4096 =
         True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS
         Infer -> DelimitedReader.HeaderBehavior.INFER
         False -> DelimitedReader.HeaderBehavior.GENERATE_HEADERS
-    skip_rows = case format.skip_rows of
-        Nothing -> 0
-        Integer -> format.skip_rows
-        _ -> Error.throw (Illegal_Argument_Error "`skip_rows` should be Integer or Nothing.")
     row_limit = case format.row_limit of
         Nothing -> -1
         Integer -> format.row_limit
@@ -127,7 +123,7 @@ read_from_reader format java_reader on_problems max_columns=4096 =
         cell_type_guesser = if format.headers != Infer then Nothing else
             formatter = format.value_formatter.if_nothing Data_Formatter
             TypeInferringParser.new formatter.get_specific_type_parsers.to_array IdentityParser.new
-        reader = DelimitedReader.new java_reader format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors
+        reader = DelimitedReader.new java_reader format.delimiter format.quote format.quote_escape java_headers format.skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors
         result_with_problems = reader.read
         parsing_problems = Vector.Vector (result_with_problems.problems) . map here.translate_reader_problem
         on_problems.attach_problems_after (Table.Table result_with_problems.value) parsing_problems

diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Excel.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Excel.enso
@@ -20,11 +20,11 @@ type Excel_Section
 
     ## Gets the data from a specific sheet. Column names are the Excel column
        names.
-    type Sheet (sheet:(Integer|Text)) (skip_rows:(Integer|Nothing)=Nothing) (row_limit:(Integer|Nothing)=Nothing)
+    type Sheet (sheet:(Integer|Text)) (skip_rows:Integer=0) (row_limit:(Integer|Nothing)=Nothing)
 
     ## Gets a specific range (taking either a defined name or external style
        address) from the workbook.
-    type Range (address:(Text|Excel_Range)) (skip_rows:(Integer|Nothing)=Nothing) (row_limit:(Integer|Nothing)=Nothing)
+    type Range (address:(Text|Excel_Range)) (skip_rows:Integer=0) (row_limit:(Integer|Nothing)=Nothing)
 
 type Excel_Range
     ## Specifies a range within an Excel Workbook.

diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso
@@ -6,9 +6,11 @@ from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Prob
 from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
 import Standard.Base.Runtime.Ref
 import Standard.Table.Internal.Delimited_Reader
+from Standard.Table.Error as Table_Errors import Unsupported_File_Type
 
 from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
 import Standard.Table.Io.Excel as Excel_Module
+import Standard.Table.Io.Quote_Style
 
 ## This type needs to be here to allow for the usage of Standard.Table
    functions. Ideally, it would be an interface within Standard.Base and
@@ -24,17 +26,18 @@ type Auto
     materialise file =
         extension = file.extension
 
-        output = Ref.new File_Format.Bytes
-        if ".txt".equals_ignore_case extension then output.put File_Format.Text
-        if ".log".equals_ignore_case extension then output.put File_Format.Text
+        output = Ref.new Nothing
+        if ".txt".equals_ignore_case extension then output.put File_Format.Plain_Text
+        if ".log".equals_ignore_case extension then output.put File_Format.Plain_Text
         if ".csv".equals_ignore_case extension then output.put (File_Format.Delimited ',')
         if ".tsv".equals_ignore_case extension then output.put (File_Format.Delimited '\t')
         if ".xlsx".equals_ignore_case extension then output.put File_Format.Excel
         if ".xlsm".equals_ignore_case extension then output.put File_Format.Excel
         if ".xls".equals_ignore_case extension then output.put File_Format.Excel
         if ".xlt".equals_ignore_case extension then output.put File_Format.Excel
 
-        output.get
+        output.get.if_nothing <|
+            Error.throw (Unsupported_File_Type file.name)
 
     ## Implements the `File.read` for this `File_Format`
     read : File -> Problem_Behavior -> Any
@@ -52,8 +55,8 @@ type Bytes
         file.read_bytes
 
 ## Reads the file to a `Text` with specified encoding.
-type Text
-    type Text (encoding:Encoding=Encoding.utf_8)
+type Plain_Text
+    type Plain_Text (encoding:Encoding=Encoding.utf_8)
 
     ## Implements the `File.read` for this `File_Format`
     read : File -> Problem_Behavior -> Any
@@ -72,6 +75,9 @@ type Delimited
        - delimiter: The delimiter character to split the file into columns. An
          `Illegal_Argument_Error` error is returned if this is an empty string.
        - encoding: The encoding to use when reading the file.
+       - skip_rows: The number of rows to skip from the top of the file.
+       - row_limit: The maximum number of rows to read from the file. This count
+         does not include the header row (if applicable).
        - quote: The quote character denotes the start and end of a quoted value.
          No quote character is used if set to `Nothing`. Quoted items are not
          split on the delimiter and can also contain newlines. Within a quoted
@@ -83,27 +89,58 @@ type Delimited
          then escaping quotes is done by double quotes: `"ab""cd"` will yield
          the text `ab"cd`. Another popular choice for `quote_escape` is the `\`
          character. Then `"ab\"cd"` will yield the same text.
+       - quote_style: The style of quoting to use when writing the file.
        - headers: If set to `True`, the first row is used as column names. If
          set to `False`, the column names are generated by adding increasing
          numeric suffixes to the base name `Column` (i.e. `Column_1`,
          `Column_2` etc.). If set to `Infer`, the process tries to infer if
          headers are present on the first row. If the column names are not
          unique, numeric suffixes will be appended to disambiguate them.
-       - skip_rows: The number of rows to skip from the top of the file.
-       - row_limit: The maximum number of rows to read from the file. This count
-         does not include the header row (if applicable).
        - value_formatter: Formatter to parse text values into numbers, dates,
          times, etc. If `Nothing` values are left as Text.
        - keep_invalid_rows: Specifies whether rows that contain less or more
          columns than expected should be kept (setting the missing columns to
          `Nothing` or dropping the excess columns) or dropped.
-    type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=Infer) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True)
+    type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (skip_rows:Integer=0) (row_limit:Integer|Nothing=Nothing) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (quote_style:Quote_Style=Quote_Style.Necessary) (headers:True|False|Infer=Infer) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True)
 
     ## Implements the `File.read` for this `File_Format`
     read : File -> Problem_Behavior -> Any
     read file on_problems =
         Delimited_Reader.read_file this file on_problems
 
+    ## PRIVATE
+     Clone the instance with some properties overridden; `delimiter`, `encoding`, `skip_rows` and `row_limit` are always copied unchanged.
+     Note: This function is internal until such time as Atom cloning with modification is built into Enso.
+    clone : (Text|Nothing)->(Text|Nothing)->Quote_Style->(Boolean|Infer)->(Data_Formatter|Nothing)->Boolean->Delimited
+    clone (quote=this.quote) (quote_escape=this.quote_escape) (quote_style=this.quote_style) (headers=this.headers) (value_formatter=this.value_formatter) (keep_invalid_rows=this.keep_invalid_rows) =
+        Delimited this.delimiter this.encoding this.skip_rows this.row_limit quote quote_escape quote_style headers value_formatter keep_invalid_rows
+
+    ## Build a copy of this format using the given `quote`, `quote_escape` and `quote_style`.
+    with_quotes : Text->Text->Quote_Style->Delimited
+    with_quotes quote quote_escape=quote quote_style=this.quote_style =
+        this.clone quote quote_escape quote_style
+
+    ## Create a clone of this with `headers` set to `True`, so the first row is used as the header.
+    with_headers : Delimited
+    with_headers = this.clone headers=True
+
+    ## Create a clone of this with `headers` set to `False`: the first row is
+       treated as data rather than as a header.
+    without_headers : Delimited
+    without_headers = this.clone headers=False
+
+    ## Create a clone of this that parses text values using `value_formatter`.
+
+       When no formatter is passed, the default `Data_Formatter` is used.
+    with_parsing : Data_Formatter -> Delimited
+    with_parsing (value_formatter=Data_Formatter) =
+        this.clone value_formatter=value_formatter
+
+    ## Create a clone of this with `value_formatter` set to `Nothing` (no value parsing).
+    without_parsing : Delimited
+    without_parsing =
+        this.clone value_formatter=Nothing
+
 ## A setting to infer the default behaviour of some option.
 type Infer
 

diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Quote_Style.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Quote_Style.enso
@@ -0,0 +1,9 @@
+type Quote_Style
+    ## When writing, never quote any value, even if the resulting file is invalid.
+    type Never
+
+    ## When writing, quote only text values which are empty or contain the delimiter or new lines.
+    type Necessary
+
+    ## When writing, quote all text values.
+    type Always
diff --git a/std-bits/table/src/main/java/org/enso/table/format/xlsx/Reader.java b/std-bits/table/src/main/java/org/enso/table/format/xlsx/Reader.java
@@ -352,7 +352,7 @@ public static String[] readRangeNames(InputStream stream, boolean xls_format) th
   public static Table readSheetByName(
       InputStream stream,
       String sheetName,
-      Integer skip_rows,
+      int skip_rows,
       Integer row_limit,
       boolean xls_format)
       throws IOException, IllegalArgumentException {
@@ -367,7 +367,7 @@ public static Table readSheetByName(
         workbook,
         sheetIndex,
         null,
-        skip_rows == null ? 0 : skip_rows,
+        skip_rows,
         row_limit == null ? Integer.MAX_VALUE : row_limit);
   }
 
@@ -383,7 +383,7 @@ public static Table readSheetByName(
    * @throws IOException when the input stream cannot be read.
    */
   public static Table readSheetByIndex(
-      InputStream stream, int index, Integer skip_rows, Integer row_limit, boolean xls_format)
+      InputStream stream, int index, int skip_rows, Integer row_limit, boolean xls_format)
       throws IOException, IllegalArgumentException {
     Workbook workbook = getWorkbook(stream, xls_format);
 
@@ -397,7 +397,7 @@ public static Table readSheetByIndex(
         workbook,
         index - 1,
         null,
-        skip_rows == null ? 0 : skip_rows,
+        skip_rows,
         row_limit == null ? Integer.MAX_VALUE : row_limit);
   }
 
@@ -415,7 +415,7 @@ public static Table readSheetByIndex(
   public static Table readRangeByName(
       InputStream stream,
       String rangeNameOrAddress,
-      Integer skip_rows,
+      int skip_rows,
       Integer row_limit,
       boolean xls_format)
       throws IOException {
@@ -438,7 +438,7 @@ public static Table readRangeByName(
    * @throws IOException when the input stream cannot be read.
    */
   public static Table readRange(
-      InputStream stream, Range range, Integer skip_rows, Integer row_limit, boolean xls_format)
+      InputStream stream, Range range, int skip_rows, Integer row_limit, boolean xls_format)
       throws IOException {
     return readRange(getWorkbook(stream, xls_format), range, skip_rows, row_limit);
   }
@@ -448,7 +448,7 @@ private static Workbook getWorkbook(InputStream stream, boolean xls_format) thro
   }
 
   private static Table readRange(
-      Workbook workbook, Range range, Integer skip_rows, Integer row_limit) {
+      Workbook workbook, Range range, int skip_rows, Integer row_limit) {
     int sheetIndex = getSheetIndex(workbook, range.getSheetName());
     if (sheetIndex == -1) {
       throw new IllegalArgumentException("Unknown sheet '" + range.getSheetName() + "'.");
@@ -458,7 +458,7 @@ private static Table readRange(
         workbook,
         sheetIndex,
         range,
-        skip_rows == null ? 0 : skip_rows,
+        skip_rows,
         row_limit == null ? Integer.MAX_VALUE : row_limit);
   }
 }