diff --git a/examples/excel_to_csv.rs b/examples/excel_to_csv.rs index 7f238dc4..7eb370ef 100644 --- a/examples/excel_to_csv.rs +++ b/examples/excel_to_csv.rs @@ -36,7 +36,7 @@ fn write_range(dest: &mut W, range: &Range) -> std::io::Resu match *c { DataType::Empty => Ok(()), DataType::String(ref s) => write!(dest, "{}", s), - DataType::Float(ref f) => write!(dest, "{}", f), + DataType::Float(ref f) | DataType::DateTime(ref f) => write!(dest, "{}", f), DataType::Int(ref i) => write!(dest, "{}", i), DataType::Error(ref e) => write!(dest, "{:?}", e), DataType::Bool(ref b) => write!(dest, "{}", b), diff --git a/src/datatype.rs b/src/datatype.rs index ce03d2c9..652a10ac 100644 --- a/src/datatype.rs +++ b/src/datatype.rs @@ -17,6 +17,8 @@ pub enum DataType { String(String), /// Boolean Bool(bool), + /// Date or Time + DateTime(f64), /// Error Error(CellErrorType), /// Empty cell @@ -121,7 +123,7 @@ impl DataType { let secs = days * 86400; chrono::NaiveDateTime::from_timestamp_opt(secs, 0) } - DataType::Float(f) => { + DataType::Float(f) | DataType::DateTime(f) => { let unix_days = f - 25569.; let unix_secs = unix_days * 86400.; let secs = unix_secs.trunc() as i64; @@ -176,6 +178,7 @@ impl fmt::Display for DataType { DataType::Float(ref e) => write!(f, "{}", e), DataType::String(ref e) => write!(f, "{}", e), DataType::Bool(ref e) => write!(f, "{}", e), + DataType::DateTime(ref e) => write!(f, "{}", e), DataType::Error(ref e) => write!(f, "{}", e), DataType::Empty => Ok(()), } diff --git a/src/de.rs b/src/de.rs index 2d3e1485..a5ee3279 100644 --- a/src/de.rs +++ b/src/de.rs @@ -571,6 +571,7 @@ impl<'a, 'de> serde::Deserializer<'de> for DataTypeDeserializer<'a> { DataType::Bool(v) => visitor.visit_bool(*v), DataType::Int(v) => visitor.visit_i64(*v), DataType::Empty => visitor.visit_unit(), + DataType::DateTime(v) => visitor.visit_f64(*v), DataType::Error(ref err) => Err(DeError::CellError { err: err.clone(), pos: self.pos, @@ -588,6 +589,7 @@ impl<'a, 'de> 
/// Reads `xl/styles.xml` and fills `self.formats` with one [`CellFormat`] per
/// `<xf>` element found inside `<cellXfs>`, in document order.
///
/// The position of an entry in `self.formats` is what `read_value` later looks
/// up through a cell's `s` attribute.
///
/// Returns `Ok(())` without touching `self.formats` when `xml_reader` yields
/// `None` for `xl/styles.xml` (presumably: the part is absent from the archive
/// -- confirm against `xml_reader`).
///
/// # Errors
///
/// * `XlsxError::Xml` on any reader error, or on a malformed attribute inside
///   a `<numFmt>` element.
/// * `XlsxError::XmlEof` (tagged `"numFmts"`, `"cellXfs"` or `"styleSheet"`)
///   when the document ends before the corresponding closing tag.
///
/// NOTE(review): only `Event::Start` is matched for `<numFmt>` and `<xf>`.
/// These elements are commonly written self-closing, so this presumably relies
/// on the reader being configured to expand empty elements -- confirm in
/// `xml_reader`.
fn read_styles(&mut self) -> Result<(), XlsxError> {
    let mut xml = match xml_reader(&mut self.zip, "xl/styles.xml") {
        None => return Ok(()),
        Some(x) => x?,
    };

    // Custom number formats declared in <numFmts>: raw `numFmtId` bytes -> format code.
    let mut number_formats = HashMap::new();

    // `buf` backs events of the outer loop, `inner_buf` those of the nested loops;
    // both are cleared before each read so they can be reused.
    let mut buf = Vec::new();
    let mut inner_buf = Vec::new();
    loop {
        buf.clear();
        match xml.read_event(&mut buf) {
            // <numFmts>: collect every custom <numFmt numFmtId=".." formatCode=".."/>.
            Ok(Event::Start(ref e)) if e.local_name() == b"numFmts" => loop {
                inner_buf.clear();
                match xml.read_event(&mut inner_buf) {
                    Ok(Event::Start(ref e)) if e.local_name() == b"numFmt" => {
                        let mut id = Vec::new();
                        let mut format = String::new();
                        for a in e.attributes() {
                            // Attribute errors are propagated here (unlike in the <xf> branch).
                            match a? {
                                Attribute {
                                    key: b"numFmtId",
                                    value: v,
                                } => id.extend_from_slice(&v),
                                Attribute {
                                    key: b"formatCode",
                                    value: v,
                                } => format = xml.decode(&v).into_owned(),
                                _ => (),
                            }
                        }
                        // A missing attribute leaves `id` / `format` empty; the entry is
                        // inserted regardless.
                        number_formats.insert(id, format);
                    }
                    Ok(Event::End(ref e)) if e.local_name() == b"numFmts" => break,
                    Ok(Event::Eof) => return Err(XlsxError::XmlEof("numFmts")),
                    Err(e) => return Err(XlsxError::Xml(e)),
                    _ => (),
                }
            },
            // <cellXfs>: one CellFormat per <xf>, resolved against the formats
            // collected so far (so a <numFmts> section appearing after <cellXfs>
            // would not be taken into account).
            Ok(Event::Start(ref e)) if e.local_name() == b"cellXfs" => loop {
                inner_buf.clear();
                match xml.read_event(&mut inner_buf) {
                    Ok(Event::Start(ref e)) if e.local_name() == b"xf" => {
                        self.formats.push(
                            e.attributes()
                                // NOTE(review): malformed attributes are silently skipped
                                // here, whereas the <numFmt> branch returns an error --
                                // confirm this best-effort behaviour is intended.
                                .filter_map(|a| a.ok())
                                .find(|a| a.key == b"numFmtId")
                                // No `numFmtId` attribute at all => not a date.
                                .map_or(CellFormat::Other, |a| {
                                    match number_formats.get(&*a.value) {
                                        // Id declared in <numFmts>: decide from its format code.
                                        Some(fmt) if is_custom_date_format(fmt) => {
                                            CellFormat::Date
                                        }
                                        // Id not declared in <numFmts>: fall back to the
                                        // table of built-in date/time ids.
                                        None if is_builtin_date_format_id(&a.value) => {
                                            CellFormat::Date
                                        }
                                        _ => CellFormat::Other,
                                    }
                                }),
                        );
                    }
                    Ok(Event::End(ref e)) if e.local_name() == b"cellXfs" => break,
                    Ok(Event::Eof) => return Err(XlsxError::XmlEof("cellXfs")),
                    Err(e) => return Err(XlsxError::Xml(e)),
                    _ => (),
                }
            },
            // Closing </styleSheet> ends the scan; every other event is ignored.
            Ok(Event::End(ref e)) if e.local_name() == b"styleSheet" => break,
            Ok(Event::Eof) => return Err(XlsxError::XmlEof("styleSheet")),
            Err(e) => return Err(XlsxError::Xml(e)),
            _ => (),
        }
    }
    Ok(())
}
-313,7 +403,7 @@ where return Err(XlsxError::UnexpectedNode("dimension")); } b"sheetData" => { - read_data(&strings, &mut xml, &mut cells)?; + read_data(&strings, &formats, &mut xml, &mut cells)?; break; } _ => (), @@ -338,10 +428,12 @@ impl Reader for Xlsx { let mut xlsx = Xlsx { zip: ZipArchive::new(reader)?, strings: Vec::new(), + formats: Vec::new(), sheets: Vec::new(), metadata: Metadata::default(), }; xlsx.read_shared_strings()?; + xlsx.read_styles()?; let relationships = xlsx.read_relationships()?; xlsx.read_workbook(&relationships)?; Ok(xlsx) @@ -366,9 +458,10 @@ impl Reader for Xlsx { None => return None, }; let strings = &self.strings; + let formats = &self.formats; xml.map(|xml| { - worksheet(strings, xml?, &mut |s, xml, cells| { - read_sheet_data(xml, s, cells) + worksheet(strings, formats, xml?, &mut |s, f, xml, cells| { + read_sheet_data(xml, s, f, cells) }) }) } @@ -380,8 +473,9 @@ impl Reader for Xlsx { }; let strings = &self.strings; + let formats = &self.formats; xml.map(|xml| { - worksheet(strings, xml?, &mut |_, xml, cells| { + worksheet(strings, formats, xml?, &mut |_, _, xml, cells| { read_sheet(xml, cells, &mut |cells, xml, e, pos, _| { match e.local_name() { b"is" | b"v" => xml.read_to_end(e.name(), &mut Vec::new())?, @@ -483,15 +577,25 @@ where fn read_sheet_data( xml: &mut XlsReader<'_>, strings: &[String], + formats: &[CellFormat], cells: &mut Vec>, ) -> Result<(), XlsxError> { /// read the contents of a cell fn read_value<'a>( v: String, strings: &[String], - atts: Attributes<'a>, + formats: &[CellFormat], + c_element: &BytesStart<'a>, ) -> Result { - match get_attribute(atts, b"t")? { + let is_date_time = match get_attribute(c_element.attributes(), b"s") { + Ok(Some(style)) => { + let id: usize = std::str::from_utf8(style).unwrap_or("0").parse()?; + matches!(formats.get(id), Some(CellFormat::Date)) + } + _ => false, + }; + + match get_attribute(c_element.attributes(), b"t")? 
/// Returns `true` when `format` is a custom number-format code that is
/// definitely a date and/or time format.
///
/// This is a deliberately conservative heuristic, not a full parser of Excel
/// format codes. A code is accepted only if
///
/// * every byte is either a date/time placeholder letter (`d`, `m`, `y`, `h`,
///   `s`, in either case) or one of the separators `-`, `/`, `.`, `:`, space
///   and backslash, **and**
/// * it contains at least one placeholder letter.
///
/// The second condition matters: without it an empty format code (for example
/// a `<numFmt>` lacking its `formatCode` attribute) or a separator-only code
/// such as `"."` would be reported as a date by vacuous truth.
///
/// Codes using sections, literals, brackets or AM/PM markers (for example
/// `[h]:mm:ss` or `h:mm AM/PM`) are not recognised and yield `false`.
fn is_custom_date_format(format: &str) -> bool {
    let is_placeholder =
        |b: u8| matches!(b.to_ascii_lowercase(), b'd' | b'm' | b'y' | b'h' | b's');
    let is_separator = |b: u8| matches!(b, b'-' | b'/' | b'.' | b':' | b' ' | b'\\');

    format.bytes().all(|b| is_placeholder(b) || is_separator(b))
        && format.bytes().any(is_placeholder)
}

/// Returns `true` when `id` (the raw bytes of a `numFmtId` attribute) is one
/// of the built-in number-format ids listed below, all of which denote a date
/// or time format.
///
/// The comparison is on the exact decimal text: no trimming or numeric parsing
/// is performed, so `b"014"` or `b" 14"` do not match.
fn is_builtin_date_format_id(id: &[u8]) -> bool {
    matches!(
        id,
        b"14"   // mm-dd-yy
        | b"15" // d-mmm-yy
        | b"16" // d-mmm
        | b"17" // mmm-yy
        | b"18" // h:mm AM/PM
        | b"19" // h:mm:ss AM/PM
        | b"20" // h:mm
        | b"21" // h:mm:ss
        | b"22" // m/d/yy h:mm
        | b"45" // mm:ss
        | b"46" // [h]:mm:ss
        | b"47" // mmss.0
    )
}