diff --git a/encoding/csv.ts b/encoding/csv.ts
new file mode 100644
index 00000000000000..3d50180cc87248
--- /dev/null
+++ b/encoding/csv.ts
@@ -0,0 +1,180 @@
+// Ported from Go:
+// https://github.com/golang/go/blob/go1.12.5/src/encoding/csv/
+// Copyright 2018-2019 the Deno authors. All rights reserved. MIT license.
+
+import { BufReader, BufState } from "../io/bufio.ts";
+import { TextProtoReader } from "../textproto/mod.ts";
+
+// Characters that may be used neither as field delimiter nor as comment marker.
+const INVALID_RUNE = ["\r", "\n", '"'];
+
+/** Error returned when a line cannot be parsed as a CSV record. */
+export class ParseError extends Error {
+  /** Line number on which the offending record starts. */
+  StartLine: number;
+  /** Line number on which the error was detected. */
+  Line: number;
+  constructor(start: number, line: number, message: string) {
+    super(message);
+    this.StartLine = start;
+    this.Line = line;
+  }
+}
+
+/** Options controlling how `read` and `readAll` parse their input. */
+export interface ParseOptions {
+  /** Field delimiter. Must not be empty, `\r`, `\n` or `"`. */
+  comma: string;
+  /** Lines starting with this marker are skipped. Must differ from `comma`. */
+  comment?: string;
+  /** When true, leading white space of every field is removed. */
+  trimLeadingSpace: boolean;
+  /** When true, a bare `"` inside a field is accepted instead of rejected. */
+  lazyQuotes?: boolean;
+  /**
+   * Expected number of fields per record. When 0, the count is taken from
+   * the first record. When negative or omitted, no check is made.
+   */
+  fieldsPerRecord?: number;
+}
+
+/**
+ * Validates the delimiter and comment marker of `opt`.
+ * Returns an "Invalid Delimiter" error, or null when the options are usable.
+ */
+function chkOptions(opt: ParseOptions): Error | null {
+  const { comma, comment } = opt;
+  if (
+    !comma ||
+    INVALID_RUNE.includes(comma) ||
+    (comment !== undefined && INVALID_RUNE.includes(comment)) ||
+    comma === comment
+  ) {
+    return new Error("Invalid Delimiter");
+  }
+  return null;
+}
+
+/**
+ * Reads one line from `reader` and splits it into fields.
+ *
+ * Blank lines and comment lines produce an empty record. A bare `"` inside a
+ * field yields a ParseError unless `opt.lazyQuotes` is set.
+ *
+ * @param Startline line number reported in a ParseError raised for this line
+ * @param reader buffered reader to take the line from
+ * @param opt parsing options; note that `#` is the default comment marker here
+ * @returns the fields of the line and the reader state or parse error
+ */
+export async function read(
+  Startline: number,
+  reader: BufReader,
+  opt: ParseOptions = { comma: ",", comment: "#", trimLeadingSpace: false }
+): Promise<[string[], BufState]> {
+  const tp = new TextProtoReader(reader);
+  let err: BufState;
+  let line: string;
+
+  [line, err] = await tp.readLine();
+
+  // Normalize \r\n to \n on all input lines.
+  if (line.endsWith("\r\n")) {
+    line = line.substring(0, line.length - 2) + "\n";
+  }
+
+  // A line holding nothing but white space counts as blank.
+  const trimmedLine = line.trimLeft();
+  if (trimmedLine.length === 0) {
+    return [[], err];
+  }
+
+  // A line starting with the comment marker is ignored. startsWith (rather
+  // than comparing the first UTF-16 unit) also handles markers outside the BMP.
+  if (opt.comment && trimmedLine.startsWith(opt.comment)) {
+    return [[], err];
+  }
+
+  let quoteError = false;
+  const result = line.split(opt.comma).map(
+    (field): string => {
+      let r = opt.trimLeadingSpace ? field.trimLeft() : field;
+      // Strip the surrounding quotes; the length check keeps a lone `"` from
+      // being treated as both the opening and the closing quote.
+      if (r.length >= 2 && r[0] === '"' && r[r.length - 1] === '"') {
+        r = r.substring(1, r.length - 1);
+      } else if (r[0] === '"') {
+        r = r.substring(1);
+      }
+
+      if (!opt.lazyQuotes && r[0] !== '"' && r.includes('"')) {
+        quoteError = true;
+      }
+      return r;
+    }
+  );
+  if (quoteError) {
+    return [
+      [],
+      new ParseError(Startline, Startline, 'bare " in non-quoted-field')
+    ];
+  }
+  return [result, err];
+}
+
+/**
+ * Reads every remaining line of `reader` and returns the non-empty records.
+ *
+ * @param reader buffered reader holding the CSV input
+ * @param opt parsing options; by default no comment marker is recognized
+ * @returns the records and null on success; otherwise the records read so
+ *     far (null for a field count mismatch) and the error
+ */
+export async function readAll(
+  reader: BufReader,
+  opt: ParseOptions = {
+    comma: ",",
+    trimLeadingSpace: false,
+    lazyQuotes: false
+  }
+): Promise<[string[][], BufState]> {
+  const result: string[][] = [];
+  // Number of fields every record must have; 0 disables the check.
+  let nbFields = 0;
+  let err: BufState;
+  let lineResult: string[];
+  // True until the field count has been taken from the first record.
+  let first = opt.fieldsPerRecord === 0;
+  let lineIndex = 0;
+  err = chkOptions(opt);
+  if (err) return [result, err];
+  if (opt.fieldsPerRecord !== undefined && opt.fieldsPerRecord > 0) {
+    nbFields = opt.fieldsPerRecord;
+  }
+
+  for (;;) {
+    // Line numbers reported in errors are 1-based, as in Go.
+    lineIndex++;
+    [lineResult, err] = await read(lineIndex, reader, opt);
+    if (err) break;
+    // Blank and comment lines are not records.
+    if (lineResult.length === 0) continue;
+
+    // If fieldsPerRecord is 0, the number of fields is taken from the
+    // first record (not from a leading blank or comment line).
+    if (first) {
+      first = false;
+      nbFields = lineResult.length;
+    }
+    if (nbFields > 0 && nbFields !== lineResult.length) {
+      return [
+        null,
+        new ParseError(lineIndex, lineIndex, "wrong number of fields")
+      ];
+    }
+    result.push(lineResult);
+  }
+  if (err !== "EOF") {
+    return [result, err];
+  }
+  return [result, null];
+}
diff --git a/encoding/csv_test.ts b/encoding/csv_test.ts
new file mode 100644
index 00000000000000..1ca68ea1623493
--- /dev/null
+++ b/encoding/csv_test.ts
@@ -0,0 +1,468 @@
+// Test ported from Golang
+// https://github.com/golang/go/blob/2cc15b1/src/encoding/csv/reader_test.go
+import { test, runIfMain } from "../testing/mod.ts";
+import { assertEquals, assert } from "../testing/asserts.ts";
+import { readAll } from "./csv.ts";
+import { StringReader } from "../io/readers.ts";
+import { BufReader } from "../io/bufio.ts";
+
+const ErrInvalidDelim = "Invalid Delimiter";
+const ErrFieldCount = "wrong number of fields";
+const ErrBareQuote = 'bare " in non-quoted-field';
+
+// TODO(zekth): Activate remaining tests
+const testCases = [
+  {
+    Name: "Simple",
+    Input: "a,b,c\n",
+    Output: [["a", "b", "c"]]
+  },
+  {
+    Name: "CRLF",
+    Input: "a,b\r\nc,d\r\n",
+    Output: [["a", "b"], ["c", "d"]]
+  },
+  {
+    Name: "BareCR",
+    Input: "a,b\rc,d\r\n",
+    Output: [["a", "b\rc", "d"]]
+  },
+  // {
+  //   Name: "RFC4180test",
+  //   Input: `#field1,field2,field3
+  // "aaa","bbb","ccc"
+  // "a,a","bbb","ccc"
+  // zzz,yyy,xxx`,
+  //   UseFieldsPerRecord: true,
+  //   FieldsPerRecord: 0,
+  //   Output: [
+  //     ["#field1", "field2", "field3"],
+  //     ["aaa", "bbb", "ccc"],
+  //     ["a,a", `bbb`, "ccc"],
+  //     ["zzz", "yyy", "xxx"]
+  //   ]
+  // },
+  {
+    Name: "NoEOLTest",
+    Input: "a,b,c",
+    Output: [["a", "b", "c"]]
+  },
+  {
+    Name: "Semicolon",
+    Input: "a;b;c\n",
+    Output: [["a", "b", "c"]],
+    Comma: ";"
+  },
+  // {
+  //   Name: "MultiLine",
+  //   Input: `"two
+  // line","one line","three
+  // line
+  // field"`,
+  //   Output: [["two\nline"], ["one line"], ["three\nline\nfield"]]
+  // },
+  {
+    Name: "BlankLine",
+    Input: "a,b,c\n\nd,e,f\n\n",
+    Output: [["a", "b", "c"], ["d", "e", "f"]]
+  },
+  {
+    Name: "BlankLineFieldCount",
+    Input: "a,b,c\n\nd,e,f\n\n",
+    Output: [["a", "b", "c"], ["d", "e", "f"]],
+    UseFieldsPerRecord: true,
+    FieldsPerRecord: 0
+  },
+  {
+    Name: "TrimSpace",
+    Input: " a, b, c\n",
+    Output: [["a", "b", "c"]],
+    TrimLeadingSpace: true
+  },
+  {
+    Name: "LeadingSpace",
+    Input: " a, b, c\n",
+    Output: [[" a", " b", " c"]]
+  },
+  {
+    Name: "Comment",
+    Input: "#1,2,3\na,b,c\n#comment",
+    Output: [["a", "b", "c"]],
+    Comment: "#"
+  },
+  {
+    Name: "NoComment",
+    Input: "#1,2,3\na,b,c",
+    Output: [["#1", "2", "3"], ["a", "b", "c"]]
+  },
+  {
+    Name: "LazyQuotes",
+    Input: `a "word","1"2",a","b`,
+    Output: [[`a "word"`, `1"2`, `a"`, `b`]],
+    LazyQuotes: true
+  },
+  {
+    Name: "BareQuotes",
+    Input: `a "word","1"2",a"`,
+    Output: [[`a "word"`, `1"2`, `a"`]],
+    LazyQuotes: true
+  },
+  {
+    Name: "BareDoubleQuotes",
+    Input: `a""b,c`,
+    Output: [[`a""b`, `c`]],
+    LazyQuotes: true
+  },
+  {
+    Name: "BadDoubleQuotes",
+    Input: `a""b,c`,
+    Error: ErrBareQuote
+    // Error: &ParseError{StartLine: 1, Line: 1, Column: 1, Err: ErrBareQuote},
+  },
+  {
+    Name: "TrimQuote",
+    Input: ` "a"," b",c`,
+    Output: [["a", " b", "c"]],
+    TrimLeadingSpace: true
+  },
+  {
+    Name: "BadBareQuote",
+    Input: `a "word","b"`,
+    Error: ErrBareQuote
+    // Error: true //&ParseError{StartLine: 1, Line: 1, Column: 2, Err: ErrBareQuote},
+  },
+  {
+    Name: "BadTrailingQuote",
+    Input: `"a word",b"`,
+    Error: ErrBareQuote
+  },
+  {
+    Name: "ExtraneousQuote",
+    Input: `"a "word","b"`,
+    Error: ErrBareQuote
+  },
+  {
+    Name: "BadFieldCount",
+    Input: "a,b,c\nd,e",
+    Error: ErrFieldCount,
+    UseFieldsPerRecord: true,
+    FieldsPerRecord: 0
+  },
+  {
+    Name: "BadFieldCount1",
+    Input: `a,b,c`,
+    // Error: &ParseError{StartLine: 1, Line: 1, Err: ErrFieldCount},
+    UseFieldsPerRecord: true,
+    FieldsPerRecord: 2,
+    Error: ErrFieldCount
+  },
+  {
+    Name: "BadFieldCountAfterComment",
+    Input: "#1,2\na,b,c\nd,e",
+    Error: ErrFieldCount,
+    Comment: "#",
+    UseFieldsPerRecord: true,
+    FieldsPerRecord: 0
+  },
+  {
+    Name: "FieldCount",
+    Input: "a,b,c\nd,e",
+    Output: [["a", "b", "c"], ["d", "e"]]
+  },
+  {
+    Name: "TrailingCommaEOF",
+    Input: "a,b,c,",
+    Output: [["a", "b", "c", ""]]
+  },
+  {
+    Name: "TrailingCommaEOL",
+    Input: "a,b,c,\n",
+    Output: [["a", "b", "c", ""]]
+  },
+  {
+    Name: "TrailingCommaSpaceEOF",
+    Input: "a,b,c, ",
+    Output: [["a", "b", "c", ""]],
+    TrimLeadingSpace: true
+  },
+  {
+    Name: "TrailingCommaSpaceEOL",
+    Input: "a,b,c, \n",
+    Output: [["a", "b", "c", ""]],
+    TrimLeadingSpace: true
+  },
+  {
+    Name: "TrailingCommaLine3",
+    Input: "a,b,c\nd,e,f\ng,hi,",
+    Output: [["a", "b", "c"], ["d", "e", "f"], ["g", "hi", ""]],
+    TrimLeadingSpace: true
+  },
+  {
+    Name: "NotTrailingComma3",
+    Input: "a,b,c, \n",
+    Output: [["a", "b", "c", " "]]
+  },
+  {
+    Name: "CommaFieldTest",
+    Input: `x,y,z,w
+x,y,z,
+x,y,,
+x,,,
+,,,
+"x","y","z","w"
+"x","y","z",""
+"x","y","",""
+"x","","",""
+"","","",""
+`,
+    Output: [
+      ["x", "y", "z", "w"],
+      ["x", "y", "z", ""],
+      ["x", "y", "", ""],
+      ["x", "", "", ""],
+      ["", "", "", ""],
+      ["x", "y", "z", "w"],
+      ["x", "y", "z", ""],
+      ["x", "y", "", ""],
+      ["x", "", "", ""],
+      ["", "", "", ""]
+    ]
+  },
+  {
+    Name: "TrailingCommaIneffective1",
+    Input: "a,b,\nc,d,e",
+    Output: [["a", "b", ""], ["c", "d", "e"]],
+    TrimLeadingSpace: true
+  },
+  {
+    Name: "ReadAllReuseRecord",
+    Input: "a,b\nc,d",
+    Output: [["a", "b"], ["c", "d"]],
+    ReuseRecord: true
+  },
+  // {
+  //   Name: "StartLine1", // Issue 19019
+  //   Input: 'a,"b\nc"d,e',
+  //   Error: true
+  //   // Error: &ParseError{StartLine: 1, Line: 2, Column: 1, Err: ErrQuote},
+  // },
+  // {
+  //   Name: "StartLine2",
+  //   Input: 'a,b\n"d\n\n,e',
+  //   Error: true
+  //   // Error: &ParseError{StartLine: 2, Line: 5, Column: 0, Err: ErrQuote},
+  // },
+  // {
+  //   Name: "CRLFInQuotedField", // Issue 21201
+  //   Input: 'A,"Hello\r\nHi",B\r\n',
+  //   Output: [["A", "Hello\nHi", "B"]]
+  // },
+  {
+    Name: "BinaryBlobField", // Issue 19410
+    Input: "x09\x41\xb4\x1c,aktau",
+    Output: [["x09A\xb4\x1c", "aktau"]]
+  },
+  // {
+  //   Name: "TrailingCR",
+  //   Input: "field1,field2\r",
+  //   Output: [["field1", "field2"]]
+  // },
+  // {
+  //   Name: "QuotedTrailingCR",
+  //   Input: '"field"\r',
+  //   Output: [['"field"']]
+  // },
+  // {
+  //   Name: "QuotedTrailingCRCR",
+  //   Input: '"field"\r\r',
+  //   Error: true,
+  //   // Error: &ParseError{StartLine: 1, Line: 1, Column: 6, Err: ErrQuote},
+  // },
+  // {
+  //   Name: "FieldCR",
+  //   Input: "field\rfield\r",
+  //   Output: [["field\rfield"]]
+  // },
+  // {
+  //   Name: "FieldCRCR",
+  //   Input: "field\r\rfield\r\r",
+  //   Output: [["field\r\rfield\r"]]
+  // },
+  {
+    Name: "FieldCRCRLF",
+    Input: "field\r\r\nfield\r\r\n",
+    Output: [["field\r"], ["field\r"]]
+  },
+  {
+    Name: "FieldCRCRLFCR",
+    Input: "field\r\r\n\rfield\r\r\n\r",
+    Output: [["field\r"], ["\rfield\r"]]
+  },
+  // {
+  //   Name: "FieldCRCRLFCRCR",
+  //   Input: "field\r\r\n\r\rfield\r\r\n\r\r",
+  //   Output: [["field\r"], ["\r\rfield\r"], ["\r"]]
+  // },
+  // {
+  //   Name: "MultiFieldCRCRLFCRCR",
+  //   Input: "field1,field2\r\r\n\r\rfield1,field2\r\r\n\r\r,",
+  //   Output: [["field1", "field2\r"], ["\r\rfield1", "field2\r"], ["\r\r", ""]]
+  // },
+  {
+    Name: "NonASCIICommaAndComment",
+    Input: "a£b,c£ \td,e\n€ comment\n",
+    Output: [["a", "b,c", "d,e"]],
+    TrimLeadingSpace: true,
+    Comma: "£",
+    Comment: "€"
+  },
+  {
+    Name: "NonASCIICommaAndCommentWithQuotes",
+    Input: 'a€" b,"€ c\nλ comment\n',
+    Output: [["a", " b,", " c"]],
+    Comma: "€",
+    Comment: "λ"
+  },
+  {
+    // λ and θ start with the same byte.
+    // This tests that the parser doesn't confuse such characters.
+    Name: "NonASCIICommaConfusion",
+    Input: '"abθcd"λefθgh',
+    Output: [["abθcd", "efθgh"]],
+    Comma: "λ",
+    Comment: "€"
+  },
+  {
+    Name: "NonASCIICommentConfusion",
+    Input: "λ\nλ\nθ\nλ\n",
+    Output: [["λ"], ["λ"], ["λ"]],
+    Comment: "θ"
+  },
+  // {
+  //   Name: "QuotedFieldMultipleLF",
+  //   Input: '"\n\n\n\n"',
+  //   Output: [["\n\n\n\n"]]
+  // },
+  // {
+  //   Name: "MultipleCRLF",
+  //   Input: "\r\n\r\n\r\n\r\n"
+  // },
+  // {
+  //   // The implementation may read each line in several chunks if it doesn't fit entirely
+  //   // in the read buffer, so we should test the code to handle that condition.
+  //   Name: "HugeLines",
+  //   Input: strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000),
+  //   Output: [[strings.Repeat("@", 5000), strings.Repeat("*", 5000)]],
+  //   Comment: '#',
+  // },
+  {
+    Name: "QuoteWithTrailingCRLF",
+    Input: '"foo"bar"\r\n',
+    Error: ErrBareQuote
+    // Error: &ParseError{StartLine: 1, Line: 1, Column: 4, Err: ErrQuote},
+  },
+  {
+    Name: "LazyQuoteWithTrailingCRLF",
+    Input: '"foo"bar"\r\n',
+    Output: [[`foo"bar`]],
+    LazyQuotes: true
+  },
+  // {
+  //   Name: "DoubleQuoteWithTrailingCRLF",
+  //   Input: '"foo""bar"\r\n',
+  //   Output: [[`foo"bar`]]
+  // },
+  // {
+  //   Name: "EvenQuotes",
+  //   Input: `""""""""`,
+  //   Output: [[`"""`]]
+  // },
+  // {
+  //   Name: "OddQuotes",
+  //   Input: `"""""""`,
+  //   Error: true
+  //   // Error:" &ParseError{StartLine: 1, Line: 1, Column: 7, Err: ErrQuote}",
+  // },
+  // {
+  //   Name: "LazyOddQuotes",
+  //   Input: `"""""""`,
+  //   Output: [[`"""`]],
+  //   LazyQuotes: true
+  // },
+  {
+    Name: "BadComma1",
+    Comma: "\n",
+    Error: ErrInvalidDelim
+  },
+  {
+    Name: "BadComma2",
+    Comma: "\r",
+    Error: ErrInvalidDelim
+  },
+  {
+    Name: "BadComma3",
+    Comma: '"',
+    Error: ErrInvalidDelim
+  },
+  {
+    Name: "BadComment1",
+    Comment: "\n",
+    Error: ErrInvalidDelim
+  },
+  {
+    Name: "BadComment2",
+    Comment: "\r",
+    Error: ErrInvalidDelim
+  },
+  {
+    Name: "BadCommaComment",
+    Comma: "X",
+    Comment: "X",
+    Error: ErrInvalidDelim
+  }
+];
+for (const t of testCases) {
+  test({
+    name: `[CSV] ${t.Name}`,
+    async fn(): Promise<void> {
+      let comma = ",";
+      let comment;
+      let fieldsPerRec;
+      let trim = false;
+      let lazyquote = false;
+      if (t.Comma) {
+        comma = t.Comma;
+      }
+      if (t.Comment) {
+        comment = t.Comment;
+      }
+      if (t.TrimLeadingSpace) {
+        trim = true;
+      }
+      if (t.UseFieldsPerRecord) {
+        fieldsPerRec = t.FieldsPerRecord;
+      }
+      if (t.LazyQuotes) {
+        lazyquote = t.LazyQuotes;
+      }
+      const actual = await readAll(new BufReader(new StringReader(t.Input)), {
+        comma: comma,
+        comment: comment,
+        trimLeadingSpace: trim,
+        fieldsPerRecord: fieldsPerRec,
+        lazyQuotes: lazyquote
+      });
+      if (t.Error) {
+        assert(!!actual[1]);
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        const e: any = actual[1];
+        assertEquals(e.message, t.Error);
+      } else {
+        const expected = [t.Output, null];
+        assertEquals(actual, expected);
+      }
+    }
+  });
+}
+
+runIfMain(import.meta);
diff --git a/encoding/test.ts b/encoding/test.ts
index 4ee03572dc4821..e7f779c866f69c 100644
--- a/encoding/test.ts
+++ b/encoding/test.ts
@@ -1,2 +1,3 @@
 // Copyright 2018-2019 the Deno authors. All rights reserved. MIT license.
 import "./toml_test.ts";
+import "./csv_test.ts";