diff --git a/CHANGELOG.md b/CHANGELOG.md
index d25fdb9..af3e0d6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+5.3.4 / 11.06.2022
+==================
+
+* Added an optional `reason?: string` property to with-schema parsing errors.
+
5.3.3 / 24.05.2022
==================
diff --git a/README.md b/README.md
index 6c75e6f..fc424a3 100644
--- a/README.md
+++ b/README.md
@@ -157,7 +157,7 @@ const schema = {
}
readXlsxFile(file, { schema }).then(({ rows, errors }) => {
- // `errors` list items have shape: `{ row, column, error, value }`.
+ // `errors` list items have shape: `{ row, column, error, reason?, value?, type? }`.
errors.length === 0
rows === [{
@@ -241,27 +241,34 @@ const { rows, errors } = convertToJson(data, schema)
```js
import { parseExcelDate } from 'read-excel-file'
-function ParseExcelError({ children: error }) {
- // Get a human-readable value.
- let value = error.value
- if (error.type === Date) {
- value = parseExcelDate(value).toString()
- }
- // Render error summary.
+function ParseExcelError({ children }) {
+ const { type, value, error, reason, row, column } = children
+
+ // Error summary.
return (
- "{error.error}"
+ "{error}"
+ {reason && ' '}
+ {reason && ("{reason}")
}
{' for value '}
- "{value}"
+ {stringifyValue(value)}
{' in column '}
- "{error.column}"
- {error.type && ' of type '}
- {error.type && "{error.type.name}"
}
+ "{column}"
+ {type && type.name && ' of type '}
+ {type && type.name && "{type.name}"
}
{' in row '}
- "{error.row}"
+ {row}
)
}
+
+function stringifyValue(value) {
+ // Wrap strings in quotes.
+ if (typeof value === 'string') {
+ return '"' + value + '"'
+ }
+ return String(value)
+}
```
diff --git a/package-lock.json b/package-lock.json
index d0f9f40..6ade8e0 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,6 +1,6 @@
{
"name": "read-excel-file",
- "version": "5.3.3",
+ "version": "5.3.4",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
diff --git a/package.json b/package.json
index e5f912b..e1f2bd3 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "read-excel-file",
- "version": "5.3.3",
+ "version": "5.3.4",
"description": "Read small to medium `*.xlsx` files in a browser or Node.js. Parse to JSON with a strict schema.",
"module": "index.js",
"main": "index.cjs",
diff --git a/source/read/isDateTimestamp.js b/source/read/isDateTimestamp.js
new file mode 100644
index 0000000..ca1cb4d
--- /dev/null
+++ b/source/read/isDateTimestamp.js
@@ -0,0 +1,118 @@
+// XLSX does have "d" type for dates, but it's not commonly used.
+// Instead, it prefers using "n" type for storing dates as timestamps.
+//
+// Whether a numeric value is a number or a date timestamp, it sometimes could be
+// detected by looking at the value "format" and seeing if it's a date-specific one.
+// https://github.com/catamphetamine/read-excel-file/issues/3#issuecomment-395770777
+//
+// The list of generic numeric value "formats":
+// https://xlsxwriter.readthedocs.io/format.html#format-set-num-format
+//
+export default function isDateTimestamp(value, styleId, styles, options) {
+ if (styleId) {
+ const style = styles[styleId]
+ if (!style) {
+ throw new Error(`Cell style not found: ${styleId}`)
+ }
+ if (
+ // Whether it's a "number format" that's conventionally used for storing date timestamps.
+ BUILT_IN_DATE_NUMBER_FORMAT_IDS.indexOf(parseInt(style.numberFormat.id)) >= 0 ||
+ // Whether it's a "number format" that uses a "formatting template"
+ // that the developer is certain is a date formatting template.
+ (options.dateFormat && style.numberFormat.template === options.dateFormat) ||
+ // Whether the "smart formatting template" feature is not disabled
+ // and it has detected that it's a date formatting template by looking at it.
+ (options.smartDateParser !== false && style.numberFormat.template && isDateTemplate(style.numberFormat.template))
+ ) {
+ return true
+ }
+ }
+}
+
+// https://hexdocs.pm/xlsxir/number_styles.html
+const BUILT_IN_DATE_NUMBER_FORMAT_IDS = [14,15,16,17,18,19,20,21,22,27,30,36,45,46,47,50,57]
+
+// On some date formats, there's an "[$-414]" prefix.
+// It looks like a locale identifier (LCID) tag; only the "414" one is trimmed below.
+//
+// https://stackoverflow.com/questions/4730152/what-indicates-an-office-open-xml-cell-contains-a-date-time-value
+//
+// Examples:
+//
+// * 27 (built-in format) "[$-404]e/m/d"
+// * 164 (custom format) "[$-414]mmmm\ yyyy;@"
+//
+const DATE_FORMAT_WEIRD_PREFIX = /^\[\$-414\]/
+
+// On some date formats, there's an ";@" postfix.
+// I don't have any idea what that is.
+// Examples:
+//
+// * 164 (custom format) "m/d/yyyy;@"
+// * 164 (custom format) "[$-414]mmmm\ yyyy;@"
+//
+const DATE_FORMAT_WEIRD_POSTFIX = /;@$/
+
+function isDateTemplate(template) {
+ // Date format tokens could be in upper case or in lower case.
+ // There seems to be no single standard.
+ // So lowercase the template first.
+ template = template.toLowerCase()
+
+ // On some date formats, there's an "[$-414]" prefix.
+ // I don't have any idea what that is. Trim it.
+ template = template.replace(DATE_FORMAT_WEIRD_PREFIX, '')
+
+ // On some date formats, there's an ";@" postfix.
+ // I don't have any idea what that is. Trim it.
+ template = template.replace(DATE_FORMAT_WEIRD_POSTFIX, '')
+
+ const tokens = template.split(/\W+/)
+ for (const token of tokens) {
+ if (DATE_TEMPLATE_TOKENS.indexOf(token) < 0) {
+ return false
+ }
+ }
+ return true
+}
+
+// These tokens could be in upper case or in lower case.
+// There seems to be no single standard, so using lower case.
+const DATE_TEMPLATE_TOKENS = [
+ // Seconds (min two digits). Example: "05".
+ 'ss',
+ // Minutes (min two digits). Example: "05". Could also be "Months". Weird.
+ 'mm',
+ // Hours. Example: "1".
+ 'h',
+ // Hours (min two digits). Example: "01".
+ 'hh',
+ // "AM" part of "AM/PM". Lowercased just in case.
+ 'am',
+ // "PM" part of "AM/PM". Lowercased just in case.
+ 'pm',
+ // Day. Example: "1"
+ 'd',
+ // Day (min two digits). Example: "01"
+ 'dd',
+ // Month (numeric). Example: "1".
+ 'm',
+ // Month (numeric, min two digits). Example: "01". Could also be "Minutes". Weird.
+ 'mm',
+ // Month (shortened month name). Example: "Jan".
+ 'mmm',
+ // Month (full month name). Example: "January".
+ 'mmmm',
+ // Two-digit year. Example: "20".
+ 'yy',
+ // Full year. Example: "2020".
+ 'yyyy',
+
+ // I don't have any idea what "e" means.
+ // It's used in "built-in" XLSX formats:
+ // * 27 '[$-404]e/m/d';
+ // * 36 '[$-404]e/m/d';
+ // * 50 '[$-404]e/m/d';
+ // * 57 '[$-404]e/m/d';
+ 'e'
+];
\ No newline at end of file
diff --git a/source/read/parseCell.js b/source/read/parseCell.js
index b99e6a4..521649d 100644
--- a/source/read/parseCell.js
+++ b/source/read/parseCell.js
@@ -9,6 +9,10 @@ import {
getCellInlineStringValue
} from '../xml/xlsx.js'
+import {
+ getOuterXml
+} from '../xml/dom.js'
+
// Example of a `<c/>`ell element:
//
//
@@ -54,6 +58,7 @@ export default function parseCell(node, sheet, xml, values, styles, properties,
column: coords[1],
value: parseCellValue(value, type, {
getInlineStringValue: () => getCellInlineStringValue(sheet, node),
+ getInlineStringXml: () => getOuterXml(node),
getStyleId: () => node.getAttribute('s'),
styles,
values,
diff --git a/source/read/parseCellValue.js b/source/read/parseCellValue.js
index f37c08d..eee405a 100644
--- a/source/read/parseCellValue.js
+++ b/source/read/parseCellValue.js
@@ -1,15 +1,15 @@
import parseDate from './parseDate.js'
-
-// https://hexdocs.pm/xlsxir/number_styles.html
-const BUILT_IN_DATE_NUMBER_FORMAT_IDS = [14,15,16,17,18,19,20,21,22,27,30,36,45,46,47,50,57]
-
-export default function getCellValue(value, type, {
- getInlineStringValue,
- getStyleId,
- styles,
- values,
- properties,
- options
+import isDateTimestamp from './isDateTimestamp.js'
+
+// Parses a string `value` of a cell.
+export default function parseCellValue(value, type, {
+ getInlineStringValue,
+ getInlineStringXml,
+ getStyleId,
+ styles,
+ values,
+ properties,
+ options
}) {
if (!type) {
// Default cell type is "n" (numeric).
@@ -24,85 +24,103 @@ export default function getCellValue(value, type, {
// http://webapp.docx4java.org/OnlineDemo/ecma376/SpreadsheetML/ST_CellType.html
//
switch (type) {
- // If the cell contains formula string.
+ // XLSX tends to store all strings as "shared" (indexed) ones
+ // using "s" cell type (for saving on storage space).
+ // "str" cell type is then generally only used for storing
+ // formula-pre-calculated cell values.
case 'str':
value = parseString(value, options)
break
- // If the cell contains an "inline" (not "shared") string.
+ // Sometimes, XLSX stores strings as "inline" strings rather than "shared" (indexed) ones.
+ // Perhaps the specification doesn't force it to use one or another.
+ // Example: `<c t="inlineStr"><is><t>Test 123</t></is></c>`.
case 'inlineStr':
value = getInlineStringValue()
if (value === undefined) {
- throw new Error(`Unsupported "inline string" cell value structure`) // : ${cellNode.textContent}`)
+ throw new Error(`Unsupported "inline string" cell value structure: ${getInlineStringXml()}`)
}
value = parseString(value, options)
break
- // If the cell contains a "shared" string.
+ // XLSX tends to store string values as "shared" (indexed) ones.
// "Shared" strings is a way for an Excel editor to reduce
// the file size by storing "commonly used" strings in a dictionary
// and then referring to such strings by their index in that dictionary.
+ // Example: `<c t="s"><v>0</v></c>`.
case 's':
// If a cell has no value then there's no `` element for it.
// If a `` element exists then it's not empty.
// The `<v/>`alue is a key in the "shared strings" dictionary of the
// XLSX file, so look it up in the `values` dictionary by the numeric key.
- value = values[Number(value)]
+ const sharedStringIndex = Number(value)
+ if (isNaN(sharedStringIndex)) {
+ throw new Error(`Invalid "shared" string index: ${value}`)
+ }
+ if (sharedStringIndex >= values.length) {
+ throw new Error(`An out-of-bounds "shared" string index: ${value}`)
+ }
+ value = values[sharedStringIndex]
value = parseString(value, options)
break
+ // Boolean (TRUE/FALSE) values are stored as either "1" or "0"
+ // in cells of type "b".
case 'b':
- value = value === '1' ? true : false
+ if (value === '1') {
+ value = true
+ } else if (value === '0') {
+ value = false
+ } else {
+ throw new Error(`Unsupported "boolean" cell value: ${value}`)
+ }
break
- // Stub: blank stub cell that is ignored by data processing utilities.
+ // XLSX specification seems to support cells of type "z":
+ // blank "stub" cells that should be ignored by data processing utilities.
case 'z':
value = undefined
break
- // Error: `value` is a numeric code.
+ // XLSX specification also defines cells of type "e" containing a numeric "error" code.
+ // It's not clear what that means though.
// They also wrote: "and `w` property stores its common name".
// It's unclear what they meant by that.
case 'e':
value = decodeError(value)
break
- // Date: a string to be parsed as a date.
- // (usually a string in "ISO 8601" format)
+ // XLSX supports date cells of type "d", though seems like it (almost?) never
+ // uses it for storing dates, preferring "n" numeric timestamp cells instead.
+ // The value of a "d" cell is supposedly a string in "ISO 8601" format.
+ // I haven't seen an XLSX file having such cells.
+ // Example: `<c t="d"><v>2021-06-10T00:47:45.700Z</v></c>`.
case 'd':
if (value === undefined) {
break
}
- value = new Date(value)
+ const parsedDate = new Date(value)
+ if (isNaN(parsedDate)) {
+ throw new Error(`Unsupported "date" cell value: ${value}`)
+ }
+ value = parsedDate
break
+ // Numeric cells have type "n".
case 'n':
if (value === undefined) {
break
}
- value = parseFloat(value)
+ const parsedNumber = Number(value)
+ if (isNaN(parsedNumber)) {
+ throw new Error(`Invalid "numeric" cell value: ${value}`)
+ }
+ value = parsedNumber
// XLSX does have "d" type for dates, but it's not commonly used.
- // Instead, spreadsheets prefer using "n" type for dates for some reason.
- //
- // In such cases, sometimes a "date" type could be heuristically detected
- // by looking at such numeric value "format" and seeing if it's a date-specific one.
- // https://github.com/catamphetamine/read-excel-file/issues/3#issuecomment-395770777
- //
- // The list of generic numeric value "formats":
- // https://xlsxwriter.readthedocs.io/format.html#format-set-num-format
- //
- const styleId = getStyleId()
- if (styleId) {
- // styleId = parseInt(styleId)
- const style = styles[styleId]
- if (!style) {
- throw new Error(`Cell style not found: ${styleId}`)
- }
- if (BUILT_IN_DATE_NUMBER_FORMAT_IDS.indexOf(parseInt(style.numberFormat.id)) >= 0 ||
- (options.dateFormat && style.numberFormat.template === options.dateFormat) ||
- (options.smartDateParser !== false && style.numberFormat.template && isDateTemplate(style.numberFormat.template))) {
- value = parseDate(value, properties)
- }
+ // Instead, it prefers using "n" type for storing dates as timestamps.
+ if (isDateTimestamp(value, getStyleId(), styles, options)) {
+ // Parse the number as a date timestamp.
+ value = parseDate(value, properties)
}
break
@@ -146,53 +164,6 @@ function decodeError(errorCode) {
}
}
-function isDateTemplate(template) {
- // Date format tokens could be in upper case or in lower case.
- // There seems to be no single standard.
- // So lowercase the template first.
- template = template.toLowerCase()
- const tokens = template.split(/\W+/)
- for (const token of tokens) {
- if (DATE_TEMPLATE_TOKENS.indexOf(token) < 0) {
- return false
- }
- }
- return true
-}
-
-// These tokens could be in upper case or in lower case.
-// There seems to be no single standard, so using lower case.
-const DATE_TEMPLATE_TOKENS = [
- // Seconds (min two digits). Example: "05".
- 'ss',
- // Minutes (min two digits). Example: "05". Could also be "Months". Weird.
- 'mm',
- // Hours. Example: "1".
- 'h',
- // Hours (min two digits). Example: "01".
- 'hh',
- // "AM" part of "AM/PM". Lowercased just in case.
- 'am',
- // "PM" part of "AM/PM". Lowercased just in case.
- 'pm',
- // Day. Example: "1"
- 'd',
- // Day (min two digits). Example: "01"
- 'dd',
- // Month (numeric). Example: "1".
- 'm',
- // Month (numeric, min two digits). Example: "01". Could also be "Minutes". Weird.
- 'mm',
- // Month (shortened month name). Example: "Jan".
- 'mmm',
- // Month (full month name). Example: "January".
- 'mmmm',
- // Two-digit year. Example: "20".
- 'yy',
- // Full year. Example: "2020".
- 'yyyy'
-];
-
function parseString(value, options) {
// In some weird cases, a developer might want to disable
// the automatic trimming of all strings.
diff --git a/source/read/parseDate.js b/source/read/parseDate.js
index b5d5c0e..5e8eb2b 100644
--- a/source/read/parseDate.js
+++ b/source/read/parseDate.js
@@ -1,67 +1,67 @@
- // Parses an Excel Date ("serial") into a corresponding javascript Date in UTC+0 timezone.
- // (with time equal to 00:00)
- //
- // https://www.pcworld.com/article/3063622/software/mastering-excel-date-time-serial-numbers-networkdays-datevalue-and-more.html
- // "If you need to calculate dates in your spreadsheets,
- // Excel uses its own unique system, which it calls Serial Numbers".
- //
- export default function parseExcelDate(excelSerialDate, options) {
- // https://support.microsoft.com/en-gb/help/214330/differences-between-the-1900-and-the-1904-date-system-in-excel
- if (options && options.epoch1904) {
- excelSerialDate += 1462
- }
+// Parses an Excel Date ("serial") into a corresponding javascript Date in UTC+0 timezone.
+// (with time equal to 00:00)
+//
+// https://www.pcworld.com/article/3063622/software/mastering-excel-date-time-serial-numbers-networkdays-datevalue-and-more.html
+// "If you need to calculate dates in your spreadsheets,
+// Excel uses its own unique system, which it calls Serial Numbers".
+//
+export default function parseExcelDate(excelSerialDate, options) {
+ // https://support.microsoft.com/en-gb/help/214330/differences-between-the-1900-and-the-1904-date-system-in-excel
+ if (options && options.epoch1904) {
+ excelSerialDate += 1462
+ }
- // "Excel serial date" is just
- // the count of days since `01/01/1900`
- // (seems that it may be even fractional).
- //
- // The count of days elapsed
- // since `01/01/1900` (Excel epoch)
- // till `01/01/1970` (Unix epoch).
- // Accounts for leap years
- // (19 of them, yielding 19 extra days).
- const daysBeforeUnixEpoch = 70 * 365 + 19
+ // "Excel serial date" is just
+ // the count of days since `01/01/1900`
+ // (seems that it may be even fractional).
+ //
+ // The count of days elapsed
+ // since `01/01/1900` (Excel epoch)
+ // till `01/01/1970` (Unix epoch).
+ // Accounts for leap years
+ // (19 of them, yielding 19 extra days).
+ const daysBeforeUnixEpoch = 70 * 365 + 19
- // An hour, approximately, because a minute
- // may be longer than 60 seconds, due to "leap seconds".
- //
- // Still, Javascript `Date` (and UNIX time in general) intentionally
- // drops the concept of "leap seconds" in order to make things simpler.
- // So it's fine.
- // https://stackoverflow.com/questions/53019726/where-are-the-leap-seconds-in-javascript
- //
- // "The JavaScript Date object specifically adheres to the concept of Unix Time
- // (albeit with higher precision). This is part of the POSIX specification,
- // and thus is sometimes called "POSIX Time". It does not count leap seconds,
- // but rather assumes every day had exactly 86,400 seconds. You can read about
- // this in section 20.3.1.1 of the current ECMAScript specification, which states:
- //
- // "Time is measured in ECMAScript in milliseconds since 01 January, 1970 UTC.
- // In time values leap seconds are ignored. It is assumed that there are exactly
- // 86,400,000 milliseconds per day."
- //
- // The fact is, that the unpredictable nature of leap seconds makes them very
- // difficult to work with in APIs. One can't generally pass timestamps around
- // that need leap seconds tables to be interpreted correctly, and expect that
- // one system will interpret them the same as another. For example, while your
- // example timestamp 1483228826 is 2017-01-01T00:00:00Z on your system,
- // it would be interpreted as 2017-01-01T00:00:26Z on POSIX based systems,
- // or systems without leap second tables. So they aren't portable.
- // Even on systems that have full updated tables, there's no telling what those
- // tables will contain in the future (beyond the 6-month IERS announcement period),
- // so I can't produce a future timestamp without risk that it may eventually change.
- //
- // To be clear - to support leap seconds in a programming language, the implementation
- // must go out of its way to do so, and must make tradeoffs that are not always acceptable.
- // Though there are exceptions, the general position is to not support them - not because
- // of any subversion or active countermeasures, but because supporting them properly is much,
- // much harder."
- //
- // https://en.wikipedia.org/wiki/Unix_time#Leap_seconds
- // https://en.wikipedia.org/wiki/Leap_year
- // https://en.wikipedia.org/wiki/Leap_second
- //
- const hour = 60 * 60 * 1000
+ // An hour, approximately, because a minute
+ // may be longer than 60 seconds, due to "leap seconds".
+ //
+ // Still, Javascript `Date` (and UNIX time in general) intentionally
+ // drops the concept of "leap seconds" in order to make things simpler.
+ // So it's fine.
+ // https://stackoverflow.com/questions/53019726/where-are-the-leap-seconds-in-javascript
+ //
+ // "The JavaScript Date object specifically adheres to the concept of Unix Time
+ // (albeit with higher precision). This is part of the POSIX specification,
+ // and thus is sometimes called "POSIX Time". It does not count leap seconds,
+ // but rather assumes every day had exactly 86,400 seconds. You can read about
+ // this in section 20.3.1.1 of the current ECMAScript specification, which states:
+ //
+ // "Time is measured in ECMAScript in milliseconds since 01 January, 1970 UTC.
+ // In time values leap seconds are ignored. It is assumed that there are exactly
+ // 86,400,000 milliseconds per day."
+ //
+ // The fact is, that the unpredictable nature of leap seconds makes them very
+ // difficult to work with in APIs. One can't generally pass timestamps around
+ // that need leap seconds tables to be interpreted correctly, and expect that
+ // one system will interpret them the same as another. For example, while your
+ // example timestamp 1483228826 is 2017-01-01T00:00:00Z on your system,
+ // it would be interpreted as 2017-01-01T00:00:26Z on POSIX based systems,
+ // or systems without leap second tables. So they aren't portable.
+ // Even on systems that have full updated tables, there's no telling what those
+ // tables will contain in the future (beyond the 6-month IERS announcement period),
+ // so I can't produce a future timestamp without risk that it may eventually change.
+ //
+ // To be clear - to support leap seconds in a programming language, the implementation
+ // must go out of its way to do so, and must make tradeoffs that are not always acceptable.
+ // Though there are exceptions, the general position is to not support them - not because
+ // of any subversion or active countermeasures, but because supporting them properly is much,
+ // much harder."
+ //
+ // https://en.wikipedia.org/wiki/Unix_time#Leap_seconds
+ // https://en.wikipedia.org/wiki/Leap_year
+ // https://en.wikipedia.org/wiki/Leap_second
+ //
+ const hour = 60 * 60 * 1000
- return new Date(Math.round((excelSerialDate - daysBeforeUnixEpoch) * 24 * hour))
- }
\ No newline at end of file
+ return new Date(Math.round((excelSerialDate - daysBeforeUnixEpoch) * 24 * hour))
+}
\ No newline at end of file
diff --git a/source/read/schema/convertToJson.js b/source/read/schema/convertToJson.js
index b3f76c2..0aa7ef7 100644
--- a/source/read/schema/convertToJson.js
+++ b/source/read/schema/convertToJson.js
@@ -76,6 +76,7 @@ function read(schema, row, rowIndex, columns, errors, options) {
}
let value
let error
+ let reason
if (isNestedSchema) {
value = read(schemaEntry.type, row, rowIndex, columns, errors, options)
} else {
@@ -89,6 +90,7 @@ function read(schema, row, rowIndex, columns, errors, options) {
if (result.error) {
value = _value
error = result.error
+ reason = result.reason
}
if (result.value !== null) {
notEmpty = true
@@ -101,6 +103,7 @@ function read(schema, row, rowIndex, columns, errors, options) {
} else {
const result = parseValue(rawValue, schemaEntry, options)
error = result.error
+ reason = result.reason
value = error ? rawValue : result.value
}
}
@@ -114,6 +117,9 @@ function read(schema, row, rowIndex, columns, errors, options) {
column: key,
value
}
+ if (reason) {
+ error.reason = reason
+ }
if (schemaEntry.type) {
error.type = schemaEntry.type
}
@@ -162,7 +168,7 @@ export function parseValue(value, schemaEntry, options) {
}
if (result.value !== null) {
if (schemaEntry.oneOf && schemaEntry.oneOf.indexOf(result.value) < 0) {
- return { error: 'invalid' }
+ return { error: 'invalid', reason: 'unknown' }
}
if (schemaEntry.validate) {
try {
@@ -197,7 +203,7 @@ function parseCustomValue(value, parse) {
* Converts textual value to a javascript typed value.
* @param {any} value
* @param {} type
- * @return {{ value: (string|number|Date|boolean), error: string }}
+ * @return {{ value: (string|number|Date|boolean), error: string, reason?: string }}
*/
function parseValueOfType(value, type, options) {
switch (type) {
@@ -205,43 +211,64 @@ function parseValueOfType(value, type, options) {
if (typeof value === 'string') {
return { value }
}
- // The global `isFinite()` function filters out:
- // * NaN
- // * -Infinity
- // * Infinity
- // All other values pass (including non-numbers).
+ // Excel tends to perform a forced automatic conversion of string-type values
+ // to number-type ones when the user has input them. Otherwise, users wouldn't
+ // be able to perform formula calculations on those cell values because users
+ // won't bother manually choosing a "numeric" cell type for each cell, and
+ // even if they did, choosing a "numeric" cell type every time wouldn't be an
+ // acceptable "user experience".
+ //
+ // So, if a cell value is supposed to be a string and Excel has automatically
+ // converted it to a number, perform a backwards conversion.
+ //
if (typeof value === 'number') {
+ // The global `isFinite()` function filters out:
+ // * NaN
+ // * -Infinity
+ // * Infinity
+ //
+ // All other values pass (including non-numbers).
+ //
if (isFinite(value)) {
return { value: String(value) }
}
+ return { error: 'invalid', reason: 'not_a_number' }
}
- return { error: 'invalid' }
+ return { error: 'invalid', reason: 'not_a_string' }
case Number:
case Integer:
- // Convert strings to numbers.
- // Just an additional feature.
- // Won't happen when called from `readXlsx()`.
+ // An XLSX file editing software might not always correctly
+ // detect numeric values in string-type cells. Users won't bother
+ // manually selecting a cell type, so the editing software has to guess
+ // based on the user's input. One can assume that such auto-detection
+ // might not always work.
+ //
+ // So, if a cell is supposed to be a numeric one, convert a string value to a number.
+ //
if (typeof value === 'string') {
const stringifiedValue = value
- value = parseFloat(value)
+ value = Number(value)
if (String(value) !== stringifiedValue) {
- return { error: 'invalid' }
+ return { error: 'invalid', reason: 'not_a_number_string' }
}
} else if (typeof value !== 'number') {
- return { error: 'invalid' }
+ return { error: 'invalid', reason: 'not_a_number' }
}
+ // At this point, `value` can only be a number.
+ //
// The global `isFinite()` function filters out:
// * NaN
// * -Infinity
// * Infinity
+ //
// All other values pass (including non-numbers).
- // At this point, `value` can only be a number.
+ //
if (!isFinite(value)) {
- return { error: 'invalid' }
+ return { error: 'invalid', reason: 'not_a_number' }
}
if (type === Integer && !isInteger(value)) {
- return { error: 'invalid' }
+ return { error: 'invalid', reason: 'not_an_integer' }
}
return { value }
@@ -250,42 +277,47 @@ function parseValueOfType(value, type, options) {
if (isURL(value)) {
return { value }
}
+ return { error: 'invalid', reason: 'not_a_url' }
}
- return { error: 'invalid' }
+ return { error: 'invalid', reason: 'not_a_string' }
case Email:
if (typeof value === 'string') {
if (isEmail(value)) {
return { value }
}
+ return { error: 'invalid', reason: 'not_an_email' }
}
- return { error: 'invalid' }
+ return { error: 'invalid', reason: 'not_a_string' }
case Date:
// XLSX has no specific format for dates.
// Sometimes a date can be heuristically detected.
// https://github.com/catamphetamine/read-excel-file/issues/3#issuecomment-395770777
if (value instanceof Date) {
+ if (isNaN(value)) {
+ return { error: 'invalid', reason: 'out_of_bounds' }
+ }
return { value }
}
if (typeof value === 'number') {
if (!isFinite(value)) {
- return { error: 'invalid' }
+ return { error: 'invalid', reason: 'not_a_number' }
}
- value = parseInt(value)
+ value = Number(value)
const date = parseDate(value, options.properties)
- if (!date) {
- return { error: 'invalid' }
+ if (isNaN(date)) {
+ return { error: 'invalid', reason: 'out_of_bounds' }
}
return { value: date }
}
- return { error: 'invalid' }
+ return { error: 'invalid', reason: 'not_a_number' }
case Boolean:
if (typeof value === 'boolean') {
return { value }
}
- return { error: 'invalid' }
+ return { error: 'invalid', reason: 'not_a_boolean' }
default:
if (typeof type === 'function') {
diff --git a/source/read/schema/convertToJson.test.js b/source/read/schema/convertToJson.test.js
index c2706b7..ba2acc1 100644
--- a/source/read/schema/convertToJson.test.js
+++ b/source/read/schema/convertToJson.test.js
@@ -183,9 +183,14 @@ describe('convertToJson', () => {
})
errors.length.should.equal(1)
- errors[0].row.should.equal(2)
- errors[0].column.should.equal('INTEGER')
- errors[0].error.should.equal('invalid')
+ errors[0].should.deep.equal({
+ error: 'invalid',
+ reason: 'not_an_integer',
+ row: 2,
+ column: 'INTEGER',
+ type: Integer,
+ value: '1.2'
+ })
rows.should.deep.equal([{
value: 1
@@ -294,6 +299,7 @@ describe('convertToJson', () => {
errors.should.deep.equal([{
error: 'invalid',
+ reason: 'not_a_number_string',
row: 1,
column: 'NUMBER',
type: Number,
@@ -334,6 +340,7 @@ describe('convertToJson', () => {
errors.should.deep.equal([{
error: 'invalid',
+ reason: 'not_a_boolean',
row: 1,
column: 'INVALID',
type: Boolean,
@@ -373,12 +380,14 @@ describe('convertToJson', () => {
errors.should.deep.equal([{
error: 'invalid',
+ reason: 'not_a_number',
row: 1,
column: 'INVALID',
type: Date,
value: '-'
}, {
error: 'invalid',
+ reason: 'not_a_number',
row: 2,
column: 'INVALID',
type: Date,
@@ -449,6 +458,7 @@ describe('convertToJson', () => {
errors.should.deep.equal([{
error: 'invalid',
+ reason: 'not_a_number_string',
row: 6,
column: 'NUMBER',
type: Number,
@@ -499,6 +509,7 @@ describe('convertToJson', () => {
errors.should.deep.equal([{
error: 'invalid',
+ reason: 'unknown',
row: 1,
column: 'STATUS',
type: String,
diff --git a/source/xml/dom.js b/source/xml/dom.js
index 2adc106..69c8eef 100644
--- a/source/xml/dom.js
+++ b/source/xml/dom.js
@@ -65,4 +65,33 @@ export function getTagName(element) {
// when getting `.tagName`, so just replacing anything
// before a colon, if any.
return element.tagName.replace(NAMESPACE_REG_EXP, '')
+}
+
+// This function is only used for occasional debug messages.
+export function getOuterXml(node) {
+ // `nodeType: 1` means "Element".
+ // https://www.w3schools.com/xml/prop_element_nodetype.asp
+ if (node.nodeType !== 1) {
+ return node.textContent
+ }
+
+ let xml = '<' + getTagName(node)
+
+ let j = 0
+ while (j < node.attributes.length) {
+ xml += ' ' + node.attributes[j].name + '=' + '"' + node.attributes[j].value + '"'
+ j++
+ }
+
+ xml += '>'
+
+ let i = 0
+ while (i < node.childNodes.length) {
+ xml += getOuterXml(node.childNodes[i])
+ i++
+ }
+
+ xml += '' + getTagName(node) + '>'
+
+ return xml
}
\ No newline at end of file
diff --git a/source/xml/xlsx.js b/source/xml/xlsx.js
index e53618c..0d557ab 100644
--- a/source/xml/xlsx.js
+++ b/source/xml/xlsx.js
@@ -3,6 +3,7 @@ import { findChild, findChildren, forEach, map, getTagName } from './dom.js'
export function getCells(document) {
const worksheet = document.documentElement
const sheetData = findChild(worksheet, 'sheetData')
+
const cells = []
forEach(sheetData, 'row', (row) => {
forEach(row, 'c', (cell) => {
@@ -29,10 +30,12 @@ export function getCellValue(document, node) {
}
export function getCellInlineStringValue(document, node) {
- if (node.firstChild &&
+ if (
+ node.firstChild &&
getTagName(node.firstChild) === 'is' &&
node.firstChild.firstChild &&
- getTagName(node.firstChild.firstChild) === 't') {
+ getTagName(node.firstChild.firstChild) === 't'
+ ) {
return node.firstChild.firstChild.textContent
}
}
diff --git a/source/xml/xml.js b/source/xml/xml.js
index d1bd413..fd3f8dd 100644
--- a/source/xml/xml.js
+++ b/source/xml/xml.js
@@ -1,7 +1,7 @@
-import XMLDOM from '@xmldom/xmldom'
+import { DOMParser } from '@xmldom/xmldom'
export default {
createDocument(content) {
- return new XMLDOM.DOMParser().parseFromString(content)
+ return new DOMParser().parseFromString(content)
}
}
\ No newline at end of file
diff --git a/source/xml/xpath/README.md b/source/xml/xpath/README.md
new file mode 100644
index 0000000..bcd9a66
--- /dev/null
+++ b/source/xml/xpath/README.md
@@ -0,0 +1,5 @@
+`xlsx-xpath.js` is an "alternative" implementation of `./xml/xlsx.js` functions using the [`XPath`](https://www.w3schools.com/xml/xpath_syntax.asp) XML document query language.
+
+`XPath` is no longer used in this project: it has been replaced with a simpler set of functions defined in `./xml/dom.js`, which are used by `./xml/xlsx.js`.
+
+The reason is that `xpathBrowser.js` turned out to be [not supported](https://github.com/catamphetamine/read-excel-file/issues/26) in Internet Explorer 11, and including a [polyfill](https://www.npmjs.com/package/xpath) for `XPath` (`xpathNode.js`) would increase the bundle size by about 100 kilobytes.
\ No newline at end of file
diff --git a/source/xml/xpath/xlsx-xpath.js b/source/xml/xpath/xlsx-xpath.js
new file mode 100644
index 0000000..f4f2e9c
--- /dev/null
+++ b/source/xml/xpath/xlsx-xpath.js
@@ -0,0 +1,84 @@
+// This file is no longer used.
+
+// Turns out IE11 doesn't support XPath, so not using `./xpathBrowser` for browsers.
+// https://github.com/catamphetamine/read-excel-file/issues/26
+// The inclusion of `xpath` package in `./xpathNode`
+// increases the bundle size by about 100 kilobytes.
+// IE11 is a wide-spread browser and it's unlikely that
+// anyone would ignore it for now.
+// There could be a separate export `read-excel-file/ie11`
+// for using `./xpathNode` instead of `./xpathBrowser`
+// but this library has been migrated to not using `xpath` anyway.
+// This code is just alternative/historical now, it seems.
+import xpath from './xpathNode'
+
+// XML namespace prefixes used in the XPath expressions of this file, mapped to their namespace URIs.
+const namespaces = {
+ a: 'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
+ // This one seems to be for `r:id` attributes on `<sheet/>`s.
+ r: 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
+ // This one seems to be for `<Relationships/>` file.
+ rr: 'http://schemas.openxmlformats.org/package/2006/relationships'
+}
+
+// Selects all cell (`c`) elements found in the rows of a worksheet's `sheetData`.
+export function getCells(document) {
+  const cellsPath = '/a:worksheet/a:sheetData/a:row/a:c'
+  return xpath(document, null, cellsPath, namespaces)
+}
+
+// Selects the `ref` attributes of the merged cells of a worksheet document.
+//
+// In SpreadsheetML, merged cells are listed as `<mergeCell ref="A1:B2"/>` elements
+// inside a `<mergeCells/>` element. The element names are `mergeCells` / `mergeCell`,
+// not `mergedCells` / `mergedCell`: with the latter, the query never matches anything.
+export function getMergedCells(document) {
+  return xpath(document, null, '/a:worksheet/a:mergeCells/a:mergeCell/@ref', namespaces)
+}
+
+// Returns the first `v` child element of a cell `node`
+// (the first item of the `./a:v` query result).
+export function getCellValue(document, node) {
+  const matches = xpath(document, node, './a:v', namespaces)
+  return matches[0]
+}
+
+// Returns the text of the inline string (`is/t` element) of a cell `node`,
+// or `undefined` when the cell doesn't have one.
+export function getCellInlineStringValue(document, node) {
+  const text = xpath(document, node, './a:is/a:t', namespaces)[0]
+  // Not every cell has an inline string: reading `.textContent`
+  // of a missing match would throw a `TypeError`.
+  if (text) {
+    return text.textContent
+  }
+}
+
+// Returns the text of the `ref` attribute of a worksheet's `dimension` element,
+// or `undefined` when there's no such attribute.
+export function getDimensions(document) {
+  const matches = xpath(document, null, '/a:worksheet/a:dimension/@ref', namespaces)
+  const ref = matches[0]
+  return ref ? ref.textContent : undefined
+}
+
+// Selects the `xf` elements of the `cellStyleXfs` element of a `styleSheet` document.
+export function getBaseStyles(document) {
+  const baseStylesPath = '/a:styleSheet/a:cellStyleXfs/a:xf'
+  return xpath(document, null, baseStylesPath, namespaces)
+}
+
+// Selects the `xf` elements of the `cellXfs` element of a `styleSheet` document.
+export function getCellStyles(document) {
+  const cellStylesPath = '/a:styleSheet/a:cellXfs/a:xf'
+  return xpath(document, null, cellStylesPath, namespaces)
+}
+
+// Selects the `numFmt` elements of the `numFmts` element of a `styleSheet` document.
+export function getNumberFormats(document) {
+  const numberFormatsPath = '/a:styleSheet/a:numFmts/a:numFmt'
+  return xpath(document, null, numberFormatsPath, namespaces)
+}
+
+// Returns the text of every shared string item (`si` element) of an `sst` document:
+// one string per item, concatenated from the item's text (`t`) elements.
+export function getSharedStrings(document) {
+  // An `<si/>` element can contain a `<t/>` (simplest case) or a set of `<r/>` ("rich formatting") elements having `<t/>`.
+  // https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.spreadsheet.sharedstringitem?redirectedfrom=MSDN&view=openxml-2.8.1
+  // http://www.datypic.com/sc/ooxml/e-ssml_si-1.html
+  const items = xpath(document, null, '/a:sst/a:si', namespaces)
+
+  // The ".//a:t[not(ancestor::a:rPh)]" selector means:
+  // "select all `<t/>` that are not children of `<rPh/>`".
+  // https://stackoverflow.com/questions/42773772/xpath-span-what-does-the-dot-mean
+  // `<rPh/>` seems to be some "phonetic data" added for languages like Japanese that should be ignored.
+  // https://github.com/doy/spreadsheet-parsexlsx/issues/72
+  return items.map((item) => {
+    const texts = xpath(document, item, './/a:t[not(ancestor::a:rPh)]', namespaces)
+    return texts.map((text) => text.textContent).join('')
+  })
+}
+
+// Returns the first `workbookPr` element of a `workbook` document
+// (the first item of the query result).
+export function getWorkbookProperties(document) {
+  const matches = xpath(document, null, '/a:workbook/a:workbookPr', namespaces)
+  return matches[0]
+}
+
+// Selects the `Relationship` elements of a `Relationships` document.
+export function getRelationships(document) {
+  const relationshipsPath = '/rr:Relationships/rr:Relationship'
+  return xpath(document, null, relationshipsPath, namespaces)
+}
+
+// Selects the `sheet` elements of the `sheets` element of a `workbook` document.
+export function getSheets(document) {
+  const sheetsPath = '/a:workbook/a:sheets/a:sheet'
+  return xpath(document, null, sheetsPath, namespaces)
+}
\ No newline at end of file
diff --git a/source/xml/xpath/xpathBrowser.js b/source/xml/xpath/xpathBrowser.js
new file mode 100644
index 0000000..aa69681
--- /dev/null
+++ b/source/xml/xpath/xpathBrowser.js
@@ -0,0 +1,29 @@
+// This file is no longer used.
+
+// Turns out IE11 doesn't support XPath, so not using `./xpathBrowser` for browsers.
+// https://github.com/catamphetamine/read-excel-file/issues/26
+// The inclusion of `xpath` package in `./xpathNode`
+// increases the bundle size by about 100 kilobytes.
+// IE11 is a wide-spread browser and it's unlikely that
+// anyone would ignore it for now.
+// There could be a separate export `read-excel-file/ie11`
+// for using `./xpathNode` instead of `./xpathBrowser`
+// but this library has been migrated to not using `xpath` anyway.
+// This code is just alternative/historical now, it seems.
+// Evaluates an XPath expression via `document.evaluate()`
+// and returns the matched nodes as an array.
+//
+// `node` is the context node: when it's not passed, `document` is used instead.
+// `namespaces` maps the namespace prefixes used in `path` to namespace URIs.
+export default function xpath(document, node, path, namespaces = {}) {
+  const resolveNamespace = (prefix) => namespaces[prefix]
+  const contextNode = node || document
+  const iterator = document.evaluate(path, contextNode, resolveNamespace, XPathResult.ANY_TYPE, null)
+  // Drain the iterator into an array.
+  const matches = []
+  for (let match = iterator.iterateNext(); match; match = iterator.iterateNext()) {
+    matches.push(match)
+  }
+  return matches
+}
\ No newline at end of file
diff --git a/source/xml/xpath/xpathNode.js b/source/xml/xpath/xpathNode.js
new file mode 100644
index 0000000..338dc53
--- /dev/null
+++ b/source/xml/xpath/xpathNode.js
@@ -0,0 +1,8 @@
+// This file is no longer used.
+
+import xpath from 'xpath'
+
+// Evaluates an XPath expression using the `xpath` package,
+// with `namespaces` mapping namespace prefixes to namespace URIs.
+// `node` is the context node: when it's not passed, `document` is used instead.
+export default function(document, node, path, namespaces = {}) {
+  const contextNode = node || document
+  return xpath.useNamespaces(namespaces)(path, contextNode)
+}
\ No newline at end of file
diff --git a/test/test.test.js b/test/test.test.js
index 0273ccc..bba6697 100644
--- a/test/test.test.js
+++ b/test/test.test.js
@@ -1,40 +1,29 @@
-import parseExcel from '../source/read/readXlsxFileNode.js'
-import assert from 'assert'
+import parseXlsx from '../source/read/readXlsxFileNode.js'
-function parseXlsx(path, sheet, callback) {
- if (typeof callback === 'undefined') {
- callback = sheet;
- sheet = '1';
- }
- parseExcel(path, sheet).then((data) => callback(null, data), callback);
-}
+const sheetsDir = './test/spreadsheets'
-var sheetsDir = './test/spreadsheets';
-var sheets = {
+const sheets = {
'excel_mac_2011-basic.xlsx': [ [ 'One', 'Two' ], [ 'Three', 'Four' ] ],
'excel_mac_2011-formatting.xlsx': [ [ 'Hey', 'now', 'so' ], [ 'cool', null, null ] ],
- 'excel_multiple_text_nodes.xlsx': [ [ 'id', 'memo' ], [ '1.0', 'abc def ghi' ], [ '2.0', 'pqr stu' ] ]
-};
+ 'excel_multiple_text_nodes.xlsx': [ [ 'id', 'memo' ], [ 1, 'abc def ghi' ], [ 2, 'pqr stu' ] ]
+}
-describe('excel.js', function() {
- for (var filename in sheets) {
+describe('read-excel-file', function() {
+ for (const filename in sheets) {
+ // Creates a javascript "closure".
+ // Otherwise, in every test, `expected` variable value would be equal
+ // to the last `for` cycle's `expected` variable value.
(function(filename, expected) {
-
describe(filename + ' basic test', function() {
- it('should return the right value', function(done) {
- parseXlsx(sheetsDir + '/' + filename, function(err, data) {
- assert.deepEqual(data, expected);
- done(err);
- });
+ it('should return the right value', async function() {
+ const result = await parseXlsx(sheetsDir + '/' + filename)
+ expect(result).to.deep.equal(expected)
})
- it('should return the right value with the sheet specified', function(done) {
- parseXlsx(sheetsDir + '/' + filename, '1', function(err, data) {
- assert.deepEqual(data, expected);
- done(err);
- });
+ it('should return the right value with the sheet specified', async function() {
+ const result = await parseXlsx(sheetsDir + '/' + filename, '1')
+ expect(result).to.deep.equal(expected)
})
- });
-
- })(filename, sheets[filename]);
+ })
+ })(filename, sheets[filename])
}
-});
+})
\ No newline at end of file
diff --git a/types.d.ts b/types.d.ts
index 53502cf..42b02c5 100644
--- a/types.d.ts
+++ b/types.d.ts
@@ -43,6 +43,7 @@ export type Schema = Record
export interface Error {
error: string;
+ reason?: string;
row: number;
column: string;
value?: any;