diff --git a/airbyte-config-oss/init-oss/src/main/resources/seed/oss_catalog.json b/airbyte-config-oss/init-oss/src/main/resources/seed/oss_catalog.json index 05d58dc12c4c7..e5acc5fdb9db9 100644 --- a/airbyte-config-oss/init-oss/src/main/resources/seed/oss_catalog.json +++ b/airbyte-config-oss/init-oss/src/main/resources/seed/oss_catalog.json @@ -12862,7 +12862,7 @@ "sourceDefinitionId": "778daa7c-feaf-4db6-96f3-70fd645acc77", "name": "File (CSV, JSON, Excel, Feather, Parquet)", "dockerRepository": "airbyte/source-file", - "dockerImageTag": "0.2.38", + "dockerImageTag": "0.3.0", "documentationUrl": "https://docs.airbyte.com/integrations/sources/file", "icon": "file.svg", "sourceType": "file", diff --git a/airbyte-config-oss/init-oss/src/main/resources/seed/source_definitions.yaml b/airbyte-config-oss/init-oss/src/main/resources/seed/source_definitions.yaml index acf110451fe27..f0fda6cd38ea4 100644 --- a/airbyte-config-oss/init-oss/src/main/resources/seed/source_definitions.yaml +++ b/airbyte-config-oss/init-oss/src/main/resources/seed/source_definitions.yaml @@ -637,7 +637,7 @@ - name: File (CSV, JSON, Excel, Feather, Parquet) sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77 dockerRepository: airbyte/source-file - dockerImageTag: 0.2.38 + dockerImageTag: 0.3.0 documentationUrl: https://docs.airbyte.com/integrations/sources/file icon: file.svg sourceType: file diff --git a/airbyte-config-oss/init-oss/src/main/resources/seed/source_specs.yaml b/airbyte-config-oss/init-oss/src/main/resources/seed/source_specs.yaml index 826b03081288e..0feb5289286cf 100644 --- a/airbyte-config-oss/init-oss/src/main/resources/seed/source_specs.yaml +++ b/airbyte-config-oss/init-oss/src/main/resources/seed/source_specs.yaml @@ -4555,7 +4555,7 @@ supportsNormalization: false supportsDBT: false supported_destination_sync_modes: [] -- dockerImage: "airbyte/source-file:0.2.38" +- dockerImage: "airbyte/source-file:0.3.0" spec: documentationUrl: 
"https://docs.airbyte.com/integrations/sources/file" connectionSpecification: diff --git a/airbyte-integrations/connectors/source-file-secure/Dockerfile b/airbyte-integrations/connectors/source-file-secure/Dockerfile index 94f23e7b42048..1d8ce1a64c70b 100644 --- a/airbyte-integrations/connectors/source-file-secure/Dockerfile +++ b/airbyte-integrations/connectors/source-file-secure/Dockerfile @@ -9,5 +9,5 @@ RUN pip install . ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.38 +LABEL io.airbyte.version=0.3.0 LABEL io.airbyte.name=airbyte/source-file-secure diff --git a/airbyte-integrations/connectors/source-file/Dockerfile b/airbyte-integrations/connectors/source-file/Dockerfile index 108dcabda7b31..51bf5b47d7643 100644 --- a/airbyte-integrations/connectors/source-file/Dockerfile +++ b/airbyte-integrations/connectors/source-file/Dockerfile @@ -17,5 +17,5 @@ COPY source_file ./source_file ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.38 +LABEL io.airbyte.version=0.3.0 LABEL io.airbyte.name=airbyte/source-file diff --git a/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test_utf16.csv b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test_utf16.csv index e786a4a6567ae..7d5a29f4ccc0b 100644 Binary files a/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test_utf16.csv and b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test_utf16.csv differ diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index ee78740ebed22..9280749911965 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ 
b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -364,6 +364,8 @@ def dtype_to_json_type(current_type: str, dtype) -> str: return "number" if dtype == "bool" and (not current_type or current_type == "boolean"): return "boolean" + if dtype == "datetime64[ns]": + return "datetime" return "string" @property @@ -419,8 +421,14 @@ def _stream_properties(self, fp, empty_schema: bool = False, read_sample_chunk: for col in df.columns: # if data type of the same column differs in dataframes, we choose the broadest one prev_frame_column_type = fields.get(col) - fields[col] = self.dtype_to_json_type(prev_frame_column_type, df[col].dtype) - return {field: {"type": [fields[field], "null"]} for field in fields} + df_type = df[col].dtype + fields[col] = self.dtype_to_json_type(prev_frame_column_type, df_type) + return { + field: ( + {"type": ["string", "null"], "format": "datetime"} if fields[field] == "datetime" else {"type": [fields[field], "null"]} + ) + for field in fields + } def streams(self, empty_schema: bool = False) -> Iterable: """Discovers available streams""" diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py index ac6a0fb16ddf1..74d09092afef5 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py @@ -93,6 +93,7 @@ def test_load_nested_json(client, absolute_path, test_files): ("boolean", "bool", "boolean"), ("number", "int64", "number"), ("number", "float64", "number"), + ("number", "datetime64[ns]", "datetime"), ], ) def test_dtype_to_json_type(client, current_type, dtype, expected): diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_source.py b/airbyte-integrations/connectors/source-file/unit_tests/test_source.py index f2365baa961db..dfd526b3ab950 100644 --- 
a/airbyte-integrations/connectors/source-file/unit_tests/test_source.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/test_source.py @@ -42,7 +42,7 @@ def test_csv_with_utf16_encoding(absolute_path, test_files): config_local_csv_utf16 = { "dataset_name": "AAA", "format": "csv", - "reader_options": '{"encoding":"utf_16"}', + "reader_options": '{"encoding":"utf_16", "parse_dates": [\"header5\"]}', "url": f"{absolute_path}/{test_files}/test_utf16.csv", "provider": {"storage": "local"}, } @@ -53,6 +53,7 @@ def test_csv_with_utf16_encoding(absolute_path, test_files): "header2": {"type": ["number", "null"]}, "header3": {"type": ["number", "null"]}, "header4": {"type": ["boolean", "null"]}, + "header5": {"type": ["string", "null"], "format": "datetime"}, }, "type": "object", } diff --git a/connectors.md b/connectors.md index a4c53aa82e169..74a3e33d62f69 100644 --- a/connectors.md +++ b/connectors.md @@ -75,7 +75,7 @@ | **Facebook Pages** | Facebook Pages icon | Source | airbyte/source-facebook-pages:0.2.4 | beta | [docs](https://docs.airbyte.com/integrations/sources/facebook-pages) | [connectors/source/facebook-pages](https://github.com/airbytehq/airbyte/issues?q=is:open+is:issue+label:connectors/source/facebook-pages) | [source-facebook-pages](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-facebook-pages) | `010eb12f-837b-4685-892d-0a39f76a98f5` | | **Fastbill** | Fastbill icon | Source | airbyte/source-fastbill:0.1.0 | alpha | [docs](https://docs.airbyte.com/integrations/sources/fastbill) | [connectors/source/fastbill](https://github.com/airbytehq/airbyte/issues?q=is:open+is:issue+label:connectors/source/fastbill) | [source-fastbill](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-fastbill) | `eb3e9c1c-0467-4eb7-a172-5265e04ccd0a` | | **Fauna** | Fauna icon | Source | airbyte/source-fauna:0.1.1 | alpha | [docs](https://docs.airbyte.com/integrations/sources/fauna) | 
[connectors/source/fauna](https://github.com/airbytehq/airbyte/issues?q=is:open+is:issue+label:connectors/source/fauna) | [source-fauna](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-fauna) | `3825db3e-c94b-42ac-bd53-b5a9507ace2b` | -| **File (CSV, JSON, Excel, Feather, Parquet)** | File (CSV, JSON, Excel, Feather, Parquet) icon | Source | airbyte/source-file:0.2.38 | generally_available | [docs](https://docs.airbyte.com/integrations/sources/file) | [connectors/source/file](https://github.com/airbytehq/airbyte/issues?q=is:open+is:issue+label:connectors/source/file) | [source-file](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-file) | `778daa7c-feaf-4db6-96f3-70fd645acc77` | +| **File (CSV, JSON, Excel, Feather, Parquet)** | File (CSV, JSON, Excel, Feather, Parquet) icon | Source | airbyte/source-file:0.3.0 | generally_available | [docs](https://docs.airbyte.com/integrations/sources/file) | [connectors/source/file](https://github.com/airbytehq/airbyte/issues?q=is:open+is:issue+label:connectors/source/file) | [source-file](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-file) | `778daa7c-feaf-4db6-96f3-70fd645acc77` | | **Firebase Realtime Database** | x | Source | airbyte/source-firebase-realtime-database:0.1.0 | alpha | [docs](https://docs.airbyte.io/integrations/sources/firebase-realtime-database) | [connectors/source/firebase-realtime-database](https://github.com/airbytehq/airbyte/issues?q=is:open+is:issue+label:connectors/source/firebase-realtime-database) | [source-firebase-realtime-database](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-firebase-realtime-database) | `acb5f973-a565-441e-992f-4946f3e65662` | | **Firebolt** | Firebolt icon | Source | airbyte/source-firebolt:0.2.0 | alpha | [docs](https://docs.airbyte.com/integrations/sources/firebolt) | 
[connectors/source/firebolt](https://github.com/airbytehq/airbyte/issues?q=is:open+is:issue+label:connectors/source/firebolt) | [source-firebolt](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-firebolt) | `6f2ac653-8623-43c4-8950-19218c7caf3d` | | **Flexport** | x | Source | airbyte/source-flexport:0.1.0 | alpha | [docs](https://docs.airbyte.com/integrations/sources/flexport) | [connectors/source/flexport](https://github.com/airbytehq/airbyte/issues?q=is:open+is:issue+label:connectors/source/flexport) | [source-flexport](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-flexport) | `f95337f1-2ad1-4baf-922f-2ca9152de630` | diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index 1d9d7c1668bfd..a1828de16d906 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -79,12 +79,13 @@ For example, if the format `CSV` is selected, then options from the [read_csv](h - It is therefore possible to customize the `delimiter` (or `sep`) to in case of tab separated files. - Header line can be ignored with `header=0` and customized with `names` +- Parse dates in specified columns - etc We would therefore provide in the `reader_options` the following json: ``` -{ "sep" : "\t", "header" : 0, "names": ["column1", "column2"]} +{ "sep" : "\t", "header" : 0, "names": ["column1", "column2"], "parse_dates": ["column2"]} ``` In case you select `JSON` format, then options from the [read_json](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-json-reader) reader are available. 
@@ -190,17 +191,18 @@ In order to read large files from a remote location, this connector uses the [sm | Version | Date | Pull Request | Subject | |:--------|:-----------|:---------------------------------------------------------|:--------------------------------------------------------------------------------------------------------| +| 0.3.0 | 2023-04-24 | [25445](https://github.com/airbytehq/airbyte/pull/25445) | Add datetime format parsing support for csv files | | 0.2.38 | 2023-04-12 | [23759](https://github.com/airbytehq/airbyte/pull/23759) | Fix column data types for numerical values | | 0.2.37 | 2023-04-06 | [24525](https://github.com/airbytehq/airbyte/pull/24525) | Fix examples in spec | | 0.2.36 | 2023-03-27 | [24588](https://github.com/airbytehq/airbyte/pull/24588) | Remove traceback from user messages. | | 0.2.35 | 2023-03-03 | [24278](https://github.com/airbytehq/airbyte/pull/24278) | Read only file header when checking connectivity; read only a single chunk when discovering the schema. | -| 0.2.34 | 2023-03-03 | [23723](https://github.com/airbytehq/airbyte/pull/23723) | Update description in spec, make user-friendly error messages and docs. | +| 0.2.34 | 2023-03-03 | [23723](https://github.com/airbytehq/airbyte/pull/23723) | Update description in spec, make user-friendly error messages and docs. | | 0.2.33 | 2023-01-04 | [21012](https://github.com/airbytehq/airbyte/pull/21012) | Fix special characters bug | | 0.2.32 | 2022-12-21 | [20740](https://github.com/airbytehq/airbyte/pull/20740) | Source File: increase SSH timeout to 60s | | 0.2.31 | 2022-11-17 | [19567](https://github.com/airbytehq/airbyte/pull/19567) | Source File: bump 0.2.31 | | 0.2.30 | 2022-11-10 | [19222](https://github.com/airbytehq/airbyte/pull/19222) | Use AirbyteConnectionStatus for "check" command | | 0.2.29 | 2022-11-08 | [18587](https://github.com/airbytehq/airbyte/pull/18587) | Fix pandas read_csv header none issue. 
| -| 0.2.28 | 2022-10-27 | [18428](https://github.com/airbytehq/airbyte/pull/18428) | Added retry logic for `Connection reset error - 104` | +| 0.2.28 | 2022-10-27 | [18428](https://github.com/airbytehq/airbyte/pull/18428) | Add retry logic for `Connection reset error - 104` | | 0.2.27 | 2022-10-26 | [18481](https://github.com/airbytehq/airbyte/pull/18481) | Fix check for wrong format | | 0.2.26 | 2022-10-18 | [18116](https://github.com/airbytehq/airbyte/pull/18116) | Transform Dropbox shared link | | 0.2.25 | 2022-10-14 | [17994](https://github.com/airbytehq/airbyte/pull/17994) | Handle `UnicodeDecodeError` during discover step. | @@ -212,7 +214,7 @@ In order to read large files from a remote location, this connector uses the [sm | 0.2.19 | 2022-08-19 | [15768](https://github.com/airbytehq/airbyte/pull/15768) | Convert 'nan' to 'null' | | 0.2.18 | 2022-08-16 | [15698](https://github.com/airbytehq/airbyte/pull/15698) | Cache binary stream to file for discover | | 0.2.17 | 2022-08-11 | [15501](https://github.com/airbytehq/airbyte/pull/15501) | Cache binary stream to file | -| 0.2.16 | 2022-08-10 | [15293](https://github.com/airbytehq/airbyte/pull/15293) | added support for encoding reader option | +| 0.2.16 | 2022-08-10 | [15293](https://github.com/airbytehq/airbyte/pull/15293) | Add support for encoding reader option | | 0.2.15 | 2022-08-05 | [15269](https://github.com/airbytehq/airbyte/pull/15269) | Bump `smart-open` version to 6.0.0 | | 0.2.12 | 2022-07-12 | [14535](https://github.com/airbytehq/airbyte/pull/14535) | Fix invalid schema generation for JSON files | | 0.2.11 | 2022-07-12 | [9974](https://github.com/airbytehq/airbyte/pull/14588) | Add support to YAML format |