diff --git a/CHANGELOG.md b/CHANGELOG.md index d25fdb9..af3e0d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +5.3.4 / 11.06.2022 +================== + +* Added an optional `reason?: string` property of a with-schema parsing error. + 5.3.3 / 24.05.2022 ================== diff --git a/README.md b/README.md index 6c75e6f..fc424a3 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,7 @@ const schema = { } readXlsxFile(file, { schema }).then(({ rows, errors }) => { - // `errors` list items have shape: `{ row, column, error, value }`. + // `errors` list items have shape: `{ row, column, error, reason?, value?, type? }`. errors.length === 0 rows === [{ @@ -241,27 +241,34 @@ const { rows, errors } = convertToJson(data, schema) ```js import { parseExcelDate } from 'read-excel-file' -function ParseExcelError({ children: error }) { - // Get a human-readable value. - let value = error.value - if (error.type === Date) { - value = parseExcelDate(value).toString() - } - // Render error summary. +function ParseExcelError({ children }) { + const { type, value, error, reason, row, column } = children + + // Error summary. return (
- "{error.error}" + "{error}" + {reason && ' '} + {reason && ("{reason}")} {' for value '} - "{value}" + {stringifyValue(value)} {' in column '} - "{error.column}" - {error.type && ' of type '} - {error.type && "{error.type.name}"} + "{column}" + {type && type.name && ' of type '} + {type && type.name && "{type.name}"} {' in row '} - "{error.row}" + {row}
) } + +function stringifyValue(value) { + // Wrap strings in quotes. + if (typeof value === 'string') { + return '"' + value + '"' + } + return String(value) +} ``` diff --git a/package-lock.json b/package-lock.json index d0f9f40..6ade8e0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "read-excel-file", - "version": "5.3.3", + "version": "5.3.4", "lockfileVersion": 1, "requires": true, "dependencies": { diff --git a/package.json b/package.json index e5f912b..e1f2bd3 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "read-excel-file", - "version": "5.3.3", + "version": "5.3.4", "description": "Read small to medium `*.xlsx` files in a browser or Node.js. Parse to JSON with a strict schema.", "module": "index.js", "main": "index.cjs", diff --git a/source/read/isDateTimestamp.js b/source/read/isDateTimestamp.js new file mode 100644 index 0000000..ca1cb4d --- /dev/null +++ b/source/read/isDateTimestamp.js @@ -0,0 +1,118 @@ +// XLSX does have "d" type for dates, but it's not commonly used. +// Instead, it prefers using "n" type for storing dates as timestamps. +// +// Whether a numeric value is a number or a date timestamp, it sometimes could be +// detected by looking at the value "format" and seeing if it's a date-specific one. +// https://github.com/catamphetamine/read-excel-file/issues/3#issuecomment-395770777 +// +// The list of generic numeric value "formats": +// https://xlsxwriter.readthedocs.io/format.html#format-set-num-format +// +export default function isDateTimestamp(value, styleId, styles, options) { + if (styleId) { + const style = styles[styleId] + if (!style) { + throw new Error(`Cell style not found: ${styleId}`) + } + if ( + // Whether it's a "number format" that's conventionally used for storing date timestamps. 
+ BUILT_IN_DATE_NUMBER_FORMAT_IDS.indexOf(parseInt(style.numberFormat.id)) >= 0 || + // Whether it's a "number format" that uses a "formatting template" + // that the developer is certain is a date formatting template. + (options.dateFormat && style.numberFormat.template === options.dateFormat) || + // Whether the "smart formatting template" feature is not disabled + // and it has detected that it's a date formatting template by looking at it. + (options.smartDateParser !== false && style.numberFormat.template && isDateTemplate(style.numberFormat.template)) + ) { + return true + } + } +} + +// https://hexdocs.pm/xlsxir/number_styles.html +const BUILT_IN_DATE_NUMBER_FORMAT_IDS = [14,15,16,17,18,19,20,21,22,27,30,36,45,46,47,50,57] + +// On some date formats, there's an "[$-414]" prefix. +// I don't have any idea what that is. +// +// https://stackoverflow.com/questions/4730152/what-indicates-an-office-open-xml-cell-contains-a-date-time-value +// +// Examples: +// +// * 27 (built-in format) "[$-404]e/m/d" +// * 164 (custom format) "[$-414]mmmm\ yyyy;@" +// +const DATE_FORMAT_WEIRD_PREFIX = /^\[\$-414\]/ + +// On some date formats, there's an ";@" postfix. +// I don't have any idea what that is. +// Examples: +// +// * 164 (custom format) "m/d/yyyy;@" +// * 164 (custom format) "[$-414]mmmm\ yyyy;@" +// +const DATE_FORMAT_WEIRD_POSTFIX = /;@$/ + +function isDateTemplate(template) { + // Date format tokens could be in upper case or in lower case. + // There seems to be no single standard. + // So lowercase the template first. + template = template.toLowerCase() + + // On some date formats, there's an "[$-414]" prefix. + // I don't have any idea what that is. Trim it. + template = template.replace(DATE_FORMAT_WEIRD_PREFIX, '') + + // On some date formats, there's an ";@" postfix. + // I don't have any idea what that is. Trim it. 
+ template = template.replace(DATE_FORMAT_WEIRD_POSTFIX, '') + + const tokens = template.split(/\W+/) + for (const token of tokens) { + if (DATE_TEMPLATE_TOKENS.indexOf(token) < 0) { + return false + } + } + return true +} + +// These tokens could be in upper case or in lower case. +// There seems to be no single standard, so using lower case. +const DATE_TEMPLATE_TOKENS = [ + // Seconds (min two digits). Example: "05". + 'ss', + // Minutes (min two digits). Example: "05". Could also be "Months". Weird. + 'mm', + // Hours. Example: "1". + 'h', + // Hours (min two digits). Example: "01". + 'hh', + // "AM" part of "AM/PM". Lowercased just in case. + 'am', + // "PM" part of "AM/PM". Lowercased just in case. + 'pm', + // Day. Example: "1" + 'd', + // Day (min two digits). Example: "01" + 'dd', + // Month (numeric). Example: "1". + 'm', + // Month (numeric, min two digits). Example: "01". Could also be "Minutes". Weird. + 'mm', + // Month (shortened month name). Example: "Jan". + 'mmm', + // Month (full month name). Example: "January". + 'mmmm', + // Two-digit year. Example: "20". + 'yy', + // Full year. Example: "2020". + 'yyyy', + + // I don't have any idea what "e" means. 
+ // It's used in "built-in" XLSX formats: + // * 27 '[$-404]e/m/d'; + // * 36 '[$-404]e/m/d'; + // * 50 '[$-404]e/m/d'; + // * 57 '[$-404]e/m/d'; + 'e' +]; \ No newline at end of file diff --git a/source/read/parseCell.js b/source/read/parseCell.js index b99e6a4..521649d 100644 --- a/source/read/parseCell.js +++ b/source/read/parseCell.js @@ -9,6 +9,10 @@ import { getCellInlineStringValue } from '../xml/xlsx.js' +import { + getOuterXml +} from '../xml/dom.js' + // Example of a ``ell element: // // @@ -54,6 +58,7 @@ export default function parseCell(node, sheet, xml, values, styles, properties, column: coords[1], value: parseCellValue(value, type, { getInlineStringValue: () => getCellInlineStringValue(sheet, node), + getInlineStringXml: () => getOuterXml(node), getStyleId: () => node.getAttribute('s'), styles, values, diff --git a/source/read/parseCellValue.js b/source/read/parseCellValue.js index f37c08d..eee405a 100644 --- a/source/read/parseCellValue.js +++ b/source/read/parseCellValue.js @@ -1,15 +1,15 @@ import parseDate from './parseDate.js' - -// https://hexdocs.pm/xlsxir/number_styles.html -const BUILT_IN_DATE_NUMBER_FORMAT_IDS = [14,15,16,17,18,19,20,21,22,27,30,36,45,46,47,50,57] - -export default function getCellValue(value, type, { - getInlineStringValue, - getStyleId, - styles, - values, - properties, - options +import isDateTimestamp from './isDateTimestamp.js' + +// Parses a string `value` of a cell. +export default function parseCellValue(value, type, { + getInlineStringValue, + getInlineStringXml, + getStyleId, + styles, + values, + properties, + options }) { if (!type) { // Default cell type is "n" (numeric). @@ -24,85 +24,103 @@ export default function getCellValue(value, type, { // http://webapp.docx4java.org/OnlineDemo/ecma376/SpreadsheetML/ST_CellType.html // switch (type) { - // If the cell contains formula string. + // XLSX tends to store all strings as "shared" (indexed) ones + // using "s" cell type (for saving on storage space). 
+ // "str" cell type is then generally only used for storing + // formula-pre-calculated cell values. case 'str': value = parseString(value, options) break - // If the cell contains an "inline" (not "shared") string. + // Sometimes, XLSX stores strings as "inline" strings rather than "shared" (indexed) ones. + // Perhaps the specification doesn't force it to use one or another. + // Example: `Test 123`. case 'inlineStr': value = getInlineStringValue() if (value === undefined) { - throw new Error(`Unsupported "inline string" cell value structure`) // : ${cellNode.textContent}`) + throw new Error(`Unsupported "inline string" cell value structure: ${getInlineStringXml()}`) } value = parseString(value, options) break - // If the cell contains a "shared" string. + // XLSX tends to store string values as "shared" (indexed) ones. // "Shared" strings is a way for an Excel editor to reduce // the file size by storing "commonly used" strings in a dictionary // and then referring to such strings by their index in that dictionary. + // Example: `0`. case 's': // If a cell has no value then there's no `` element for it. // If a `` element exists then it's not empty. // The ``alue is a key in the "shared strings" dictionary of the // XLSX file, so look it up in the `values` dictionary by the numeric key. - value = values[Number(value)] + const sharedStringIndex = Number(value) + if (isNaN(sharedStringIndex)) { + throw new Error(`Invalid "shared" string index: ${value}`) + } + if (sharedStringIndex >= values.length) { + throw new Error(`An out-of-bounds "shared" string index: ${value}`) + } + value = values[sharedStringIndex] value = parseString(value, options) break + // Boolean (TRUE/FALSE) values are stored as either "1" or "0" + // in cells of type "b". case 'b': - value = value === '1' ? 
true : false + if (value === '1') { + value = true + } else if (value === '0') { + value = false + } else { + throw new Error(`Unsupported "boolean" cell value: ${value}`) + } break - // Stub: blank stub cell that is ignored by data processing utilities. + // XLSX specification seems to support cells of type "z": + // blank "stub" cells that should be ignored by data processing utilities. case 'z': value = undefined break - // Error: `value` is a numeric code. + // XLSX specification also defines cells of type "e" containing a numeric "error" code. + // It's not clear what that means though. // They also wrote: "and `w` property stores its common name". // It's unclear what they meant by that. case 'e': value = decodeError(value) break - // Date: a string to be parsed as a date. - // (usually a string in "ISO 8601" format) + // XLSX supports date cells of type "d", though seems like it (almost?) never + // uses it for storing dates, preferring "n" numeric timestamp cells instead. + // The value of a "d" cell is supposedly a string in "ISO 8601" format. + // I haven't seen an XLSX file having such cells. + // Example: `2021-06-10T00:47:45.700Z`. case 'd': if (value === undefined) { break } - value = new Date(value) + const parsedDate = new Date(value) + if (isNaN(parsedDate)) { + throw new Error(`Unsupported "date" cell value: ${value}`) + } + value = parsedDate break + // Numeric cells have type "n". case 'n': if (value === undefined) { break } - value = parseFloat(value) + const parsedNumber = Number(value) + if (isNaN(parsedNumber)) { + throw new Error(`Invalid "numeric" cell value: ${value}`) + } + value = parsedNumber // XLSX does have "d" type for dates, but it's not commonly used. - // Instead, spreadsheets prefer using "n" type for dates for some reason. - // - // In such cases, sometimes a "date" type could be heuristically detected - // by looking at such numeric value "format" and seeing if it's a date-specific one. 
- // https://github.com/catamphetamine/read-excel-file/issues/3#issuecomment-395770777 - // - // The list of generic numeric value "formats": - // https://xlsxwriter.readthedocs.io/format.html#format-set-num-format - // - const styleId = getStyleId() - if (styleId) { - // styleId = parseInt(styleId) - const style = styles[styleId] - if (!style) { - throw new Error(`Cell style not found: ${styleId}`) - } - if (BUILT_IN_DATE_NUMBER_FORMAT_IDS.indexOf(parseInt(style.numberFormat.id)) >= 0 || - (options.dateFormat && style.numberFormat.template === options.dateFormat) || - (options.smartDateParser !== false && style.numberFormat.template && isDateTemplate(style.numberFormat.template))) { - value = parseDate(value, properties) - } + // Instead, it prefers using "n" type for storing dates as timestamps. + if (isDateTimestamp(value, getStyleId(), styles, options)) { + // Parse the number as a date timestamp. + value = parseDate(value, properties) } break @@ -146,53 +164,6 @@ function decodeError(errorCode) { } } -function isDateTemplate(template) { - // Date format tokens could be in upper case or in lower case. - // There seems to be no single standard. - // So lowercase the template first. - template = template.toLowerCase() - const tokens = template.split(/\W+/) - for (const token of tokens) { - if (DATE_TEMPLATE_TOKENS.indexOf(token) < 0) { - return false - } - } - return true -} - -// These tokens could be in upper case or in lower case. -// There seems to be no single standard, so using lower case. -const DATE_TEMPLATE_TOKENS = [ - // Seconds (min two digits). Example: "05". - 'ss', - // Minutes (min two digits). Example: "05". Could also be "Months". Weird. - 'mm', - // Hours. Example: "1". - 'h', - // Hours (min two digits). Example: "01". - 'hh', - // "AM" part of "AM/PM". Lowercased just in case. - 'am', - // "PM" part of "AM/PM". Lowercased just in case. - 'pm', - // Day. Example: "1" - 'd', - // Day (min two digits). Example: "01" - 'dd', - // Month (numeric). 
Example: "1". - 'm', - // Month (numeric, min two digits). Example: "01". Could also be "Minutes". Weird. - 'mm', - // Month (shortened month name). Example: "Jan". - 'mmm', - // Month (full month name). Example: "January". - 'mmmm', - // Two-digit year. Example: "20". - 'yy', - // Full year. Example: "2020". - 'yyyy' -]; - function parseString(value, options) { // In some weird cases, a developer might want to disable // the automatic trimming of all strings. diff --git a/source/read/parseDate.js b/source/read/parseDate.js index b5d5c0e..5e8eb2b 100644 --- a/source/read/parseDate.js +++ b/source/read/parseDate.js @@ -1,67 +1,67 @@ - // Parses an Excel Date ("serial") into a corresponding javascript Date in UTC+0 timezone. - // (with time equal to 00:00) - // - // https://www.pcworld.com/article/3063622/software/mastering-excel-date-time-serial-numbers-networkdays-datevalue-and-more.html - // "If you need to calculate dates in your spreadsheets, - // Excel uses its own unique system, which it calls Serial Numbers". - // - export default function parseExcelDate(excelSerialDate, options) { - // https://support.microsoft.com/en-gb/help/214330/differences-between-the-1900-and-the-1904-date-system-in-excel - if (options && options.epoch1904) { - excelSerialDate += 1462 - } +// Parses an Excel Date ("serial") into a corresponding javascript Date in UTC+0 timezone. +// (with time equal to 00:00) +// +// https://www.pcworld.com/article/3063622/software/mastering-excel-date-time-serial-numbers-networkdays-datevalue-and-more.html +// "If you need to calculate dates in your spreadsheets, +// Excel uses its own unique system, which it calls Serial Numbers". 
+// +export default function parseExcelDate(excelSerialDate, options) { + // https://support.microsoft.com/en-gb/help/214330/differences-between-the-1900-and-the-1904-date-system-in-excel + if (options && options.epoch1904) { + excelSerialDate += 1462 + } - // "Excel serial date" is just - // the count of days since `01/01/1900` - // (seems that it may be even fractional). - // - // The count of days elapsed - // since `01/01/1900` (Excel epoch) - // till `01/01/1970` (Unix epoch). - // Accounts for leap years - // (19 of them, yielding 19 extra days). - const daysBeforeUnixEpoch = 70 * 365 + 19 + // "Excel serial date" is just + // the count of days since `01/01/1900` + // (seems that it may be even fractional). + // + // The count of days elapsed + // since `01/01/1900` (Excel epoch) + // till `01/01/1970` (Unix epoch). + // Accounts for leap years + // (19 of them, yielding 19 extra days). + const daysBeforeUnixEpoch = 70 * 365 + 19 - // An hour, approximately, because a minute - // may be longer than 60 seconds, due to "leap seconds". - // - // Still, Javascript `Date` (and UNIX time in general) intentionally - // drops the concept of "leap seconds" in order to make things simpler. - // So it's fine. - // https://stackoverflow.com/questions/53019726/where-are-the-leap-seconds-in-javascript - // - // "The JavaScript Date object specifically adheres to the concept of Unix Time - // (albeit with higher precision). This is part of the POSIX specification, - // and thus is sometimes called "POSIX Time". It does not count leap seconds, - // but rather assumes every day had exactly 86,400 seconds. You can read about - // this in section 20.3.1.1 of the current ECMAScript specification, which states: - // - // "Time is measured in ECMAScript in milliseconds since 01 January, 1970 UTC. - // In time values leap seconds are ignored. It is assumed that there are exactly - // 86,400,000 milliseconds per day." 
- // - // The fact is, that the unpredictable nature of leap seconds makes them very - // difficult to work with in APIs. One can't generally pass timestamps around - // that need leap seconds tables to be interpreted correctly, and expect that - // one system will interpret them the same as another. For example, while your - // example timestamp 1483228826 is 2017-01-01T00:00:00Z on your system, - // it would be interpreted as 2017-01-01T00:00:26Z on POSIX based systems, - // or systems without leap second tables. So they aren't portable. - // Even on systems that have full updated tables, there's no telling what those - // tables will contain in the future (beyond the 6-month IERS announcement period), - // so I can't produce a future timestamp without risk that it may eventually change. - // - // To be clear - to support leap seconds in a programming language, the implementation - // must go out of its way to do so, and must make tradeoffs that are not always acceptable. - // Though there are exceptions, the general position is to not support them - not because - // of any subversion or active countermeasures, but because supporting them properly is much, - // much harder." - // - // https://en.wikipedia.org/wiki/Unix_time#Leap_seconds - // https://en.wikipedia.org/wiki/Leap_year - // https://en.wikipedia.org/wiki/Leap_second - // - const hour = 60 * 60 * 1000 + // An hour, approximately, because a minute + // may be longer than 60 seconds, due to "leap seconds". + // + // Still, Javascript `Date` (and UNIX time in general) intentionally + // drops the concept of "leap seconds" in order to make things simpler. + // So it's fine. + // https://stackoverflow.com/questions/53019726/where-are-the-leap-seconds-in-javascript + // + // "The JavaScript Date object specifically adheres to the concept of Unix Time + // (albeit with higher precision). This is part of the POSIX specification, + // and thus is sometimes called "POSIX Time". 
It does not count leap seconds, + // but rather assumes every day had exactly 86,400 seconds. You can read about + // this in section 20.3.1.1 of the current ECMAScript specification, which states: + // + // "Time is measured in ECMAScript in milliseconds since 01 January, 1970 UTC. + // In time values leap seconds are ignored. It is assumed that there are exactly + // 86,400,000 milliseconds per day." + // + // The fact is, that the unpredictable nature of leap seconds makes them very + // difficult to work with in APIs. One can't generally pass timestamps around + // that need leap seconds tables to be interpreted correctly, and expect that + // one system will interpret them the same as another. For example, while your + // example timestamp 1483228826 is 2017-01-01T00:00:00Z on your system, + // it would be interpreted as 2017-01-01T00:00:26Z on POSIX based systems, + // or systems without leap second tables. So they aren't portable. + // Even on systems that have full updated tables, there's no telling what those + // tables will contain in the future (beyond the 6-month IERS announcement period), + // so I can't produce a future timestamp without risk that it may eventually change. + // + // To be clear - to support leap seconds in a programming language, the implementation + // must go out of its way to do so, and must make tradeoffs that are not always acceptable. + // Though there are exceptions, the general position is to not support them - not because + // of any subversion or active countermeasures, but because supporting them properly is much, + // much harder." 
+ // + // https://en.wikipedia.org/wiki/Unix_time#Leap_seconds + // https://en.wikipedia.org/wiki/Leap_year + // https://en.wikipedia.org/wiki/Leap_second + // + const hour = 60 * 60 * 1000 - return new Date(Math.round((excelSerialDate - daysBeforeUnixEpoch) * 24 * hour)) - } \ No newline at end of file + return new Date(Math.round((excelSerialDate - daysBeforeUnixEpoch) * 24 * hour)) +} \ No newline at end of file diff --git a/source/read/schema/convertToJson.js b/source/read/schema/convertToJson.js index b3f76c2..0aa7ef7 100644 --- a/source/read/schema/convertToJson.js +++ b/source/read/schema/convertToJson.js @@ -76,6 +76,7 @@ function read(schema, row, rowIndex, columns, errors, options) { } let value let error + let reason if (isNestedSchema) { value = read(schemaEntry.type, row, rowIndex, columns, errors, options) } else { @@ -89,6 +90,7 @@ function read(schema, row, rowIndex, columns, errors, options) { if (result.error) { value = _value error = result.error + reason = result.reason } if (result.value !== null) { notEmpty = true @@ -101,6 +103,7 @@ function read(schema, row, rowIndex, columns, errors, options) { } else { const result = parseValue(rawValue, schemaEntry, options) error = result.error + reason = result.reason value = error ? rawValue : result.value } } @@ -114,6 +117,9 @@ function read(schema, row, rowIndex, columns, errors, options) { column: key, value } + if (reason) { + error.reason = reason + } if (schemaEntry.type) { error.type = schemaEntry.type } @@ -162,7 +168,7 @@ export function parseValue(value, schemaEntry, options) { } if (result.value !== null) { if (schemaEntry.oneOf && schemaEntry.oneOf.indexOf(result.value) < 0) { - return { error: 'invalid' } + return { error: 'invalid', reason: 'unknown' } } if (schemaEntry.validate) { try { @@ -197,7 +203,7 @@ function parseCustomValue(value, parse) { * Converts textual value to a javascript typed value. 
* @param {any} value * @param {} type - * @return {{ value: (string|number|Date|boolean), error: string }} + * @return {{ value: (string|number|Date|boolean), error: string, reason?: string }} */ function parseValueOfType(value, type, options) { switch (type) { @@ -205,43 +211,64 @@ function parseValueOfType(value, type, options) { if (typeof value === 'string') { return { value } } - // The global `isFinite()` function filters out: - // * NaN - // * -Infinity - // * Infinity - // All other values pass (including non-numbers). + // Excel tends to perform a forced automatic conversion of string-type values + // to number-type ones when the user has input them. Otherwise, users wouldn't + // be able to perform formula calculations on those cell values because users + // won't bother manually choosing a "numeric" cell type for each cell, and + // even if they did, choosing a "numeric" cell type every time wouldn't be an + // acceptable "user experience". + // + // So, if a cell value is supposed to be a string and Excel has automatically + // converted it to a number, perform a backwards conversion. + // if (typeof value === 'number') { + // The global `isFinite()` function filters out: + // * NaN + // * -Infinity + // * Infinity + // + // All other values pass (including non-numbers). + // if (isFinite(value)) { return { value: String(value) } } + return { error: 'invalid', reason: 'not_a_number' } } - return { error: 'invalid' } + return { error: 'invalid', reason: 'not_a_string' } case Number: case Integer: - // Convert strings to numbers. - // Just an additional feature. - // Won't happen when called from `readXlsx()`. + // An XLSX file editing software might not always correctly + // detect numeric values in string-type cells. Users won't bother + // manually selecting a cell type, so the editing software has to guess + // based on the user's input. One can assume that such auto-detection + // might not always work. 
+ // + // So, if a cell is supposed to be a numeric one, convert a string value to a number. + // if (typeof value === 'string') { const stringifiedValue = value - value = parseFloat(value) + value = Number(value) if (String(value) !== stringifiedValue) { - return { error: 'invalid' } + return { error: 'invalid', reason: 'not_a_number_string' } } } else if (typeof value !== 'number') { - return { error: 'invalid' } + return { error: 'invalid', reason: 'not_a_number' } } + // At this point, `value` can only be a number. + // // The global `isFinite()` function filters out: // * NaN // * -Infinity // * Infinity + // // All other values pass (including non-numbers). - // At this point, `value` can only be a number. + // if (!isFinite(value)) { - return { error: 'invalid' } + return { error: 'invalid', reason: 'not_a_number' } } if (type === Integer && !isInteger(value)) { - return { error: 'invalid' } + return { error: 'invalid', reason: 'not_an_integer' } } return { value } @@ -250,42 +277,47 @@ function parseValueOfType(value, type, options) { if (isURL(value)) { return { value } } + return { error: 'invalid', reason: 'not_a_url' } } - return { error: 'invalid' } + return { error: 'invalid', reason: 'not_a_string' } case Email: if (typeof value === 'string') { if (isEmail(value)) { return { value } } + return { error: 'invalid', reason: 'not_an_email' } } - return { error: 'invalid' } + return { error: 'invalid', reason: 'not_a_string' } case Date: // XLSX has no specific format for dates. // Sometimes a date can be heuristically detected. 
// https://github.com/catamphetamine/read-excel-file/issues/3#issuecomment-395770777 if (value instanceof Date) { + if (isNaN(value)) { + return { error: 'invalid', reason: 'out_of_bounds' } + } return { value } } if (typeof value === 'number') { if (!isFinite(value)) { - return { error: 'invalid' } + return { error: 'invalid', reason: 'not_a_number' } } - value = parseInt(value) + value = Number(value) const date = parseDate(value, options.properties) - if (!date) { - return { error: 'invalid' } + if (isNaN(date)) { + return { error: 'invalid', reason: 'out_of_bounds' } } return { value: date } } - return { error: 'invalid' } + return { error: 'invalid', reason: 'not_a_number' } case Boolean: if (typeof value === 'boolean') { return { value } } - return { error: 'invalid' } + return { error: 'invalid', reason: 'not_a_boolean' } default: if (typeof type === 'function') { diff --git a/source/read/schema/convertToJson.test.js b/source/read/schema/convertToJson.test.js index c2706b7..ba2acc1 100644 --- a/source/read/schema/convertToJson.test.js +++ b/source/read/schema/convertToJson.test.js @@ -183,9 +183,14 @@ describe('convertToJson', () => { }) errors.length.should.equal(1) - errors[0].row.should.equal(2) - errors[0].column.should.equal('INTEGER') - errors[0].error.should.equal('invalid') + errors[0].should.deep.equal({ + error: 'invalid', + reason: 'not_an_integer', + row: 2, + column: 'INTEGER', + type: Integer, + value: '1.2' + }) rows.should.deep.equal([{ value: 1 @@ -294,6 +299,7 @@ describe('convertToJson', () => { errors.should.deep.equal([{ error: 'invalid', + reason: 'not_a_number_string', row: 1, column: 'NUMBER', type: Number, @@ -334,6 +340,7 @@ describe('convertToJson', () => { errors.should.deep.equal([{ error: 'invalid', + reason: 'not_a_boolean', row: 1, column: 'INVALID', type: Boolean, @@ -373,12 +380,14 @@ describe('convertToJson', () => { errors.should.deep.equal([{ error: 'invalid', + reason: 'not_a_number', row: 1, column: 'INVALID', type: 
Date, value: '-' }, { error: 'invalid', + reason: 'not_a_number', row: 2, column: 'INVALID', type: Date, @@ -449,6 +458,7 @@ describe('convertToJson', () => { errors.should.deep.equal([{ error: 'invalid', + reason: 'not_a_number_string', row: 6, column: 'NUMBER', type: Number, @@ -499,6 +509,7 @@ describe('convertToJson', () => { errors.should.deep.equal([{ error: 'invalid', + reason: 'unknown', row: 1, column: 'STATUS', type: String, diff --git a/source/xml/dom.js b/source/xml/dom.js index 2adc106..69c8eef 100644 --- a/source/xml/dom.js +++ b/source/xml/dom.js @@ -65,4 +65,33 @@ export function getTagName(element) { // when getting `.tagName`, so just replacing anything // before a colon, if any. return element.tagName.replace(NAMESPACE_REG_EXP, '') +} + +// This function is only used for occasional debug messages. +export function getOuterXml(node) { + // `nodeType: 1` means "Element". + // https://www.w3schools.com/xml/prop_element_nodetype.asp + if (node.nodeType !== 1) { + return node.textContent + } + + let xml = '<' + getTagName(node) + + let j = 0 + while (j < node.attributes.length) { + xml += ' ' + node.attributes[j].name + '=' + '"' + node.attributes[j].value + '"' + j++ + } + + xml += '>' + + let i = 0 + while (i < node.childNodes.length) { + xml += getOuterXml(node.childNodes[i]) + i++ + } + + xml += '' + + return xml } \ No newline at end of file diff --git a/source/xml/xlsx.js b/source/xml/xlsx.js index e53618c..0d557ab 100644 --- a/source/xml/xlsx.js +++ b/source/xml/xlsx.js @@ -3,6 +3,7 @@ import { findChild, findChildren, forEach, map, getTagName } from './dom.js' export function getCells(document) { const worksheet = document.documentElement const sheetData = findChild(worksheet, 'sheetData') + const cells = [] forEach(sheetData, 'row', (row) => { forEach(row, 'c', (cell) => { @@ -29,10 +30,12 @@ export function getCellValue(document, node) { } export function getCellInlineStringValue(document, node) { - if (node.firstChild && + if ( + 
node.firstChild && getTagName(node.firstChild) === 'is' && node.firstChild.firstChild && - getTagName(node.firstChild.firstChild) === 't') { + getTagName(node.firstChild.firstChild) === 't' + ) { return node.firstChild.firstChild.textContent } } diff --git a/source/xml/xml.js b/source/xml/xml.js index d1bd413..fd3f8dd 100644 --- a/source/xml/xml.js +++ b/source/xml/xml.js @@ -1,7 +1,7 @@ -import XMLDOM from '@xmldom/xmldom' +import { DOMParser } from '@xmldom/xmldom' export default { createDocument(content) { - return new XMLDOM.DOMParser().parseFromString(content) + return new DOMParser().parseFromString(content) } } \ No newline at end of file diff --git a/source/xml/xpath/README.md b/source/xml/xpath/README.md new file mode 100644 index 0000000..bcd9a66 --- /dev/null +++ b/source/xml/xpath/README.md @@ -0,0 +1,5 @@ +`xlsx-xpath.js` is an "alternative" implementation of `./xml/xlsx.js` functions using the [`XPath`](https://www.w3schools.com/xml/xpath_syntax.asp) XML document query language. + +`XPath` is no longer used in this project and has been replaced by a simpler set of functions defined in `./xml/dom.js` that are used in `./xml/xlsx.js`. + +The reason is that `xpathBrowser.js` turned out to be [not supported](https://github.com/catamphetamine/read-excel-file/issues/26) in Internet Explorer 11, and including a [polyfill](https://www.npmjs.com/package/xpath) for `XPath` (`xpathNode.js`) would increase the bundle size by about 100 kilobytes. \ No newline at end of file diff --git a/source/xml/xpath/xlsx-xpath.js b/source/xml/xpath/xlsx-xpath.js new file mode 100644 index 0000000..f4f2e9c --- /dev/null +++ b/source/xml/xpath/xlsx-xpath.js @@ -0,0 +1,84 @@ +// This file is no longer used. + +// Turns out IE11 doesn't support XPath, so not using `./xpathBrowser` for browsers. +// https://github.com/catamphetamine/read-excel-file/issues/26 +// The inclusion of `xpath` package in `./xpathNode` +// increases the bundle size by about 100 kilobytes.
+// IE11 is a wide-spread browser and it's unlikely that +// anyone would ignore it for now. +// There could be a separate export `read-excel-file/ie11` +// for using `./xpathNode` instead of `./xpathBrowser` +// but this library has been migrated to not using `xpath` anyway. +// This code is just alternative/historical now, it seems. +import xpath from './xpathNode' + +const namespaces = { + a: 'http://schemas.openxmlformats.org/spreadsheetml/2006/main', + // This one seems to be for `r:id` attributes on `<sheet/>`s. + r: 'http://schemas.openxmlformats.org/officeDocument/2006/relationships', + // This one seems to be for `<Relationships/>` file. + rr: 'http://schemas.openxmlformats.org/package/2006/relationships' +} + +export function getCells(document) { + return xpath(document, null, '/a:worksheet/a:sheetData/a:row/a:c', namespaces) +} + +export function getMergedCells(document) { + return xpath(document, null, '/a:worksheet/a:mergedCells/a:mergedCell/@ref', namespaces) +} + +export function getCellValue(document, node) { + return xpath(document, node, './a:v', namespaces)[0] +} + +export function getCellInlineStringValue(document, node) { + return xpath(document, node, './a:is/a:t', namespaces)[0].textContent +} + +export function getDimensions(document) { + const dimensions = xpath(document, null, '/a:worksheet/a:dimension/@ref', namespaces)[0] + if (dimensions) { + return dimensions.textContent + } +} + +export function getBaseStyles(document) { + return xpath(document, null, '/a:styleSheet/a:cellStyleXfs/a:xf', namespaces) +} + +export function getCellStyles(document) { + return xpath(document, null, '/a:styleSheet/a:cellXfs/a:xf', namespaces) +} + +export function getNumberFormats(document) { + return xpath(document, null, '/a:styleSheet/a:numFmts/a:numFmt', namespaces) +} + +export function getSharedStrings(document) { + // An `<si/>` element can contain a `<t/>` (simplest case) or a set of `<r/>` ("rich formatting") elements having `<t/>`.
+ // https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.spreadsheet.sharedstringitem?redirectedfrom=MSDN&view=openxml-2.8.1 + // http://www.datypic.com/sc/ooxml/e-ssml_si-1.html + +  // The ".//a:t[not(ancestor::a:rPh)]" selector means: +  // "select all `<t/>` that are not descendants of `<rPh/>`". + // https://stackoverflow.com/questions/42773772/xpath-span-what-does-the-dot-mean +  // `<rPh/>` seems to be some "phonetic data" added for languages like Japanese that should be ignored. +  // https://github.com/doy/spreadsheet-parsexlsx/issues/72 +  return xpath(document, null, '/a:sst/a:si', namespaces) +    .map(string => xpath(document, string, './/a:t[not(ancestor::a:rPh)]', namespaces) +        .map(_ => _.textContent).join('') +    ) +} + +export function getWorkbookProperties(document) { + return xpath(document, null, '/a:workbook/a:workbookPr', namespaces)[0] +} + +export function getRelationships(document) { + return xpath(document, null, '/rr:Relationships/rr:Relationship', namespaces) +} + +export function getSheets(document) { + return xpath(document, null, '/a:workbook/a:sheets/a:sheet', namespaces) +} \ No newline at end of file diff --git a/source/xml/xpath/xpathBrowser.js b/source/xml/xpath/xpathBrowser.js new file mode 100644 index 0000000..aa69681 --- /dev/null +++ b/source/xml/xpath/xpathBrowser.js @@ -0,0 +1,29 @@ +// This file is no longer used. + +// Turns out IE11 doesn't support XPath, so not using `./xpathBrowser` for browsers. +// https://github.com/catamphetamine/read-excel-file/issues/26 +// The inclusion of `xpath` package in `./xpathNode` +// increases the bundle size by about 100 kilobytes. +// IE11 is a wide-spread browser and it's unlikely that +// anyone would ignore it for now. +// There could be a separate export `read-excel-file/ie11` +// for using `./xpathNode` instead of `./xpathBrowser` +// but this library has been migrated to not using `xpath` anyway. +// This code is just alternative/historical now, it seems.
+export default function xpath(document, node, path, namespaces = {}) { + const nodes = document.evaluate( + path, + node || document, + prefix => namespaces[prefix], + XPathResult.ANY_TYPE, + null + ) + // Convert iterator to an array. + const results = [] + let result = nodes.iterateNext() + while (result) { + results.push(result) + result = nodes.iterateNext() + } + return results +} \ No newline at end of file diff --git a/source/xml/xpath/xpathNode.js b/source/xml/xpath/xpathNode.js new file mode 100644 index 0000000..338dc53 --- /dev/null +++ b/source/xml/xpath/xpathNode.js @@ -0,0 +1,8 @@ +// This file is no longer used. + +import xpath from 'xpath' + +export default function(document, node, path, namespaces = {}) { + const select = xpath.useNamespaces(namespaces) + return select(path, node || document) +} \ No newline at end of file diff --git a/test/test.test.js b/test/test.test.js index 0273ccc..bba6697 100644 --- a/test/test.test.js +++ b/test/test.test.js @@ -1,40 +1,29 @@ -import parseExcel from '../source/read/readXlsxFileNode.js' -import assert from 'assert' +import parseXlsx from '../source/read/readXlsxFileNode.js' -function parseXlsx(path, sheet, callback) { - if (typeof callback === 'undefined') { - callback = sheet; - sheet = '1'; - } - parseExcel(path, sheet).then((data) => callback(null, data), callback); -} +const sheetsDir = './test/spreadsheets' -var sheetsDir = './test/spreadsheets'; -var sheets = { +const sheets = { 'excel_mac_2011-basic.xlsx': [ [ 'One', 'Two' ], [ 'Three', 'Four' ] ], 'excel_mac_2011-formatting.xlsx': [ [ 'Hey', 'now', 'so' ], [ 'cool', null, null ] ], - 'excel_multiple_text_nodes.xlsx': [ [ 'id', 'memo' ], [ '1.0', 'abc def ghi' ], [ '2.0', 'pqr stu' ] ] -}; + 'excel_multiple_text_nodes.xlsx': [ [ 'id', 'memo' ], [ 1, 'abc def ghi' ], [ 2, 'pqr stu' ] ] +} -describe('excel.js', function() { - for (var filename in sheets) { +describe('read-excel-file', function() { + for (const filename in sheets) { + // Creates a 
javascript "closure". + // Otherwise, in every test, `expected` variable value would be equal + // to the last `for` cycle's `expected` variable value. (function(filename, expected) { - describe(filename + ' basic test', function() { - it('should return the right value', function(done) { - parseXlsx(sheetsDir + '/' + filename, function(err, data) { - assert.deepEqual(data, expected); - done(err); - }); + it('should return the right value', async function() { + const result = await parseXlsx(sheetsDir + '/' + filename) + expect(result).to.deep.equal(expected) }) - it('should return the right value with the sheet specified', function(done) { - parseXlsx(sheetsDir + '/' + filename, '1', function(err, data) { - assert.deepEqual(data, expected); - done(err); - }); + it('should return the right value with the sheet specified', async function() { + const result = await parseXlsx(sheetsDir + '/' + filename, '1') + expect(result).to.deep.equal(expected) }) - }); - - })(filename, sheets[filename]); + }) + })(filename, sheets[filename]) } -}); +}) \ No newline at end of file diff --git a/types.d.ts b/types.d.ts index 53502cf..42b02c5 100644 --- a/types.d.ts +++ b/types.d.ts @@ -43,6 +43,7 @@ export type Schema = Record export interface Error { error: string; + reason?: string; row: number; column: string; value?: any;