From 79f23fb93979aa1ef3a9bbf7049b93d1e6a0e95a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=95=E0=A4=BE=E0=A4=B0=E0=A4=A4=E0=A5=8B=E0=A4=AB?= =?UTF-8?q?=E0=A5=8D=E0=A4=AB=E0=A5=87=E0=A4=B2=E0=A4=B8=E0=A5=8D=E0=A4=95?= =?UTF-8?q?=E0=A5=8D=E0=A4=B0=E0=A4=BF=E0=A4=AA=E0=A5=8D=E0=A4=9F=E2=84=A2?= Date: Wed, 18 Oct 2023 16:57:37 +0200 Subject: [PATCH] feat(Spreadsheet File Node): Improve CSV parsing (#7448) This adds support for 1. custom delimiters 2. reading offsets to avoid having to read a large CSV all at once 3. excluding byte-order-mark NODE-861 #7443 --- .../nodes/SpreadsheetFile/description.ts | 53 ++++++ .../test/SpreadsheetFile.test.ts | 31 +++- .../nodes/SpreadsheetFile/test/bom.csv | 2 + .../SpreadsheetFile/test/workflow.bom.json | 155 ++++++++++++++++++ .../nodes/SpreadsheetFile/test/workflow.json | 30 +++- .../v2/SpreadsheetFileV2.node.ts | 17 +- 6 files changed, 273 insertions(+), 15 deletions(-) create mode 100644 packages/nodes-base/nodes/SpreadsheetFile/test/bom.csv create mode 100644 packages/nodes-base/nodes/SpreadsheetFile/test/workflow.bom.json diff --git a/packages/nodes-base/nodes/SpreadsheetFile/description.ts b/packages/nodes-base/nodes/SpreadsheetFile/description.ts index 2f188613c964f..903245fc61db8 100644 --- a/packages/nodes-base/nodes/SpreadsheetFile/description.ts +++ b/packages/nodes-base/nodes/SpreadsheetFile/description.ts @@ -201,6 +201,59 @@ export const optionsProperties: INodeProperties[] = [ default: true, description: 'Whether the first row of the file contains the header names', }, + { + displayName: 'Delimiter', + name: 'delimiter', + type: 'string', + displayOptions: { + show: { + '/operation': ['fromFile'], + '/fileFormat': ['csv'], + }, + }, + default: ',', + description: 'Set the field delimiter', + }, + { + displayName: 'Starting Line', + name: 'fromLine', + type: 'number', + displayOptions: { + show: { + '/operation': ['fromFile'], + '/fileFormat': ['csv'], + }, + }, + default: 0, + description: 'Start handling records 
from the requested line number', + }, + { + displayName: 'Max Number of Rows to Load', + name: 'maxRowCount', + type: 'number', + displayOptions: { + show: { + '/operation': ['fromFile'], + '/fileFormat': ['csv'], + }, + }, + default: -1, + description: 'Stop handling records after the requested number of rows are read', + }, + { + displayName: 'Exclude Byte Order Mark (BOM)', + name: 'enableBOM', + type: 'boolean', + displayOptions: { + show: { + '/operation': ['fromFile'], + '/fileFormat': ['csv'], + }, + }, + default: false, + description: + 'Whether to detect and exclude the byte-order-mark from the CSV Input if present', + }, { displayName: 'Include Empty Cells', name: 'includeEmptyCells', diff --git a/packages/nodes-base/nodes/SpreadsheetFile/test/SpreadsheetFile.test.ts b/packages/nodes-base/nodes/SpreadsheetFile/test/SpreadsheetFile.test.ts index 4305f15716979..83dacb85c65ba 100644 --- a/packages/nodes-base/nodes/SpreadsheetFile/test/SpreadsheetFile.test.ts +++ b/packages/nodes-base/nodes/SpreadsheetFile/test/SpreadsheetFile.test.ts @@ -1,24 +1,28 @@ +import path from 'path'; +import type { IWorkflowBase } from 'n8n-workflow'; import * as Helpers from '@test/nodes/Helpers'; import type { WorkflowTestData } from '@test/nodes/types'; - import { executeWorkflow } from '@test/nodes/ExecuteWorkflow'; -import path from 'path'; describe('Execute Spreadsheet File Node', () => { beforeEach(async () => { await Helpers.initBinaryDataService(); }); - // replace workflow json 'Read Binary File' node's filePath to local file - const workflow = Helpers.readJsonFileSync('nodes/SpreadsheetFile/test/workflow.json'); - const node = workflow.nodes.find((n: any) => n.name === 'Read Binary File'); - node.parameters.filePath = path.join(__dirname, 'spreadsheet.csv'); + const loadWorkflow = (fileName: string, csvName: string) => { + const workflow = Helpers.readJsonFileSync<IWorkflowBase>( + `nodes/SpreadsheetFile/test/${fileName}`, + ); + const node = workflow.nodes.find((n) => n.name === 'Read
Binary File'); + node!.parameters.fileSelector = path.join(__dirname, csvName); + return workflow; + }; const tests: WorkflowTestData[] = [ { description: 'execute workflow.json', input: { - workflowData: workflow, + workflowData: loadWorkflow('workflow.json', 'spreadsheet.csv'), }, output: { nodeData: { @@ -78,6 +82,7 @@ describe('Execute Spreadsheet File Node', () => { }, ], ], + 'Read CSV with Row Limit': [[{ json: { A: '1', B: '2', C: '3' } }]], 'Write To File CSV': [ [ { @@ -149,6 +154,18 @@ describe('Execute Spreadsheet File Node', () => { }, }, }, + { + description: 'execute workflow.bom.json', + input: { + workflowData: loadWorkflow('workflow.bom.json', 'bom.csv'), + }, + output: { + nodeData: { + 'Edit with BOM included': [[{ json: { X: null } }]], + 'Edit with BOM excluded': [[{ json: { X: '1' } }]], + }, + }, + }, ]; const nodeTypes = Helpers.setup(tests); diff --git a/packages/nodes-base/nodes/SpreadsheetFile/test/bom.csv b/packages/nodes-base/nodes/SpreadsheetFile/test/bom.csv new file mode 100644 index 0000000000000..9c60e4f480b2f --- /dev/null +++ b/packages/nodes-base/nodes/SpreadsheetFile/test/bom.csv @@ -0,0 +1,2 @@ +a,b,c +1,2,3 diff --git a/packages/nodes-base/nodes/SpreadsheetFile/test/workflow.bom.json b/packages/nodes-base/nodes/SpreadsheetFile/test/workflow.bom.json new file mode 100644 index 0000000000000..2f6d22457dacb --- /dev/null +++ b/packages/nodes-base/nodes/SpreadsheetFile/test/workflow.bom.json @@ -0,0 +1,155 @@ +{ + "nodes": [ + { + "parameters": {}, + "id": "40bf604f-19f9-43e7-8bbb-74c36925f154", + "name": "When clicking \"Execute Workflow\"", + "type": "n8n-nodes-base.manualTrigger", + "typeVersion": 1, + "position": [ + -320, + 1040 + ] + }, + { + "parameters": { + "fileSelector": "bom.csv" + }, + "id": "623ea890-8882-4273-973e-834652d823b5", + "name": "Read Binary File", + "type": "n8n-nodes-base.readBinaryFiles", + "typeVersion": 1, + "position": [ + -100, + 1040 + ] + }, + { + "parameters": { + "fileFormat": "csv", + 
"options": { + "enableBOM": true + } + }, + "id": "c8cca5fb-e119-4ca1-a597-4f051a7f64ea", + "name": "Exclude BOM", + "type": "n8n-nodes-base.spreadsheetFile", + "typeVersion": 2, + "position": [ + 120, + 960 + ] + }, + { + "parameters": { + "fileFormat": "csv", + "options": { + "enableBOM": false + } + }, + "id": "56ec11dc-966b-4d06-b8c0-61475b30333d", + "name": "Include BOM", + "type": "n8n-nodes-base.spreadsheetFile", + "typeVersion": 2, + "position": [ + 120, + 1180 + ] + }, + { + "parameters": { + "fields": { + "values": [ + { + "name": "X", + "stringValue": "={{ $json.a }}" + } + ] + }, + "include": "none", + "options": {} + }, + "id": "6f6bccf2-d674-4774-9df9-6f6fd893bace", + "name": "Edit with BOM excluded", + "type": "n8n-nodes-base.set", + "typeVersion": 3.2, + "position": [ + 320, + 960 + ] + }, + { + "parameters": { + "fields": { + "values": [ + { + "name": "X", + "stringValue": "={{ $json.a }}" + } + ] + }, + "include": "none", + "options": {} + }, + "id": "27ca5cde-19cb-4bf2-9ab4-7f7e77ad01bd", + "name": "Edit with BOM included", + "type": "n8n-nodes-base.set", + "typeVersion": 3.2, + "position": [ + 320, + 1180 + ] + } + ], + "connections": { + "When clicking \"Execute Workflow\"": { + "main": [ + [ + { + "node": "Read Binary File", + "type": "main", + "index": 0 + } + ] + ] + }, + "Exclude BOM": { + "main": [ + [ + { + "node": "Edit with BOM excluded", + "type": "main", + "index": 0 + } + ] + ] + }, + "Include BOM": { + "main": [ + [ + { + "node": "Edit with BOM included", + "type": "main", + "index": 0 + } + ] + ] + }, + "Read Binary File": { + "main": [ + [ + { + "node": "Exclude BOM", + "type": "main", + "index": 0 + }, + { + "node": "Include BOM", + "type": "main", + "index": 0 + } + ] + ] + } + } +} diff --git a/packages/nodes-base/nodes/SpreadsheetFile/test/workflow.json b/packages/nodes-base/nodes/SpreadsheetFile/test/workflow.json index b52c7918bd411..5f1ce7c152efd 100644 --- a/packages/nodes-base/nodes/SpreadsheetFile/test/workflow.json +++ 
b/packages/nodes-base/nodes/SpreadsheetFile/test/workflow.json @@ -1,7 +1,4 @@ { - "meta": { - "instanceId": "104a4d08d8897b8bdeb38aaca515021075e0bd8544c983c2bb8c86e6a8e6081c" - }, "nodes": [ { "parameters": {}, @@ -29,11 +26,11 @@ }, { "parameters": { - "filePath": "C:\\Users\\spech\\Documents\\GitHub\\n8n-master\\packages\\nodes-base\\nodes\\SpreadsheetFile\\test\\spreadsheet.csv" + "fileSelector": "spreadsheet.csv" }, "id": "d7620053-eb3d-43dd-b2cd-d60d9a08a9cc", "name": "Read Binary File", - "type": "n8n-nodes-base.readBinaryFile", + "type": "n8n-nodes-base.readBinaryFiles", "typeVersion": 1, "position": [ 840, @@ -173,6 +170,22 @@ 1060, 940 ] + }, + { + "parameters": { + "fileFormat": "csv", + "options": { + "maxRowCount": 1 + } + }, + "id": "de905389-a11b-4dd8-8416-14d650804445", + "name": "Read CSV with Row Limit", + "type": "n8n-nodes-base.spreadsheetFile", + "typeVersion": 2, + "position": [ + -60, + 1340 + ] } ], "connections": { @@ -245,9 +258,14 @@ "node": "Read From File Read as String", "type": "main", "index": 0 + }, + { + "node": "Read CSV with Row Limit", + "type": "main", + "index": 0 } ] ] } } -} \ No newline at end of file +} diff --git a/packages/nodes-base/nodes/SpreadsheetFile/v2/SpreadsheetFileV2.node.ts b/packages/nodes-base/nodes/SpreadsheetFile/v2/SpreadsheetFileV2.node.ts index 41d4f5ec10d4a..360b749cdfa03 100644 --- a/packages/nodes-base/nodes/SpreadsheetFile/v2/SpreadsheetFileV2.node.ts +++ b/packages/nodes-base/nodes/SpreadsheetFile/v2/SpreadsheetFileV2.node.ts @@ -1,5 +1,4 @@ /* eslint-disable n8n-nodes-base/node-filename-against-convention */ -import { pipeline } from 'stream/promises'; import type { IDataObject, IExecuteFunctions, @@ -85,7 +84,12 @@ export class SpreadsheetFileV2 implements INodeType { } if (fileFormat === 'csv') { + const maxRowCount = options.maxRowCount as number; const parser = createCSVParser({ + delimiter: options.delimiter as string, + fromLine: options.fromLine as number, + bom: options.enableBOM as 
boolean, + to: maxRowCount > -1 ? maxRowCount : undefined, columns: options.headerRow !== false, onRecord: (record) => { rows.push(record); @@ -93,9 +97,18 @@ export class SpreadsheetFileV2 implements INodeType { }); if (binaryData.id) { const stream = await this.helpers.getBinaryStream(binaryData.id); - await pipeline(stream, parser); + await new Promise<void>(async (resolve, reject) => { + parser.on('error', reject); + parser.on('readable', () => { + stream.unpipe(parser); + stream.destroy(); + resolve(); + }); + stream.pipe(parser); + }); } else { parser.write(binaryData.data, BINARY_ENCODING); + parser.end(); } } else { let workbook: WorkBook;