Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect date/time formatted cells in XLSX #198

Merged
merged 1 commit into from
Feb 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/excel_to_csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ fn write_range<W: Write>(dest: &mut W, range: &Range<DataType>) -> std::io::Resu
match *c {
DataType::Empty => Ok(()),
DataType::String(ref s) => write!(dest, "{}", s),
DataType::Float(ref f) => write!(dest, "{}", f),
DataType::Float(ref f) | DataType::DateTime(ref f) => write!(dest, "{}", f),
DataType::Int(ref i) => write!(dest, "{}", i),
DataType::Error(ref e) => write!(dest, "{:?}", e),
DataType::Bool(ref b) => write!(dest, "{}", b),
Expand Down
5 changes: 4 additions & 1 deletion src/datatype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ pub enum DataType {
String(String),
/// Boolean
Bool(bool),
/// Date or Time
DateTime(f64),
/// Error
Error(CellErrorType),
/// Empty cell
Expand Down Expand Up @@ -121,7 +123,7 @@ impl DataType {
let secs = days * 86400;
chrono::NaiveDateTime::from_timestamp_opt(secs, 0)
}
DataType::Float(f) => {
DataType::Float(f) | DataType::DateTime(f) => {
let unix_days = f - 25569.;
let unix_secs = unix_days * 86400.;
let secs = unix_secs.trunc() as i64;
Expand Down Expand Up @@ -176,6 +178,7 @@ impl fmt::Display for DataType {
DataType::Float(ref e) => write!(f, "{}", e),
DataType::String(ref e) => write!(f, "{}", e),
DataType::Bool(ref e) => write!(f, "{}", e),
DataType::DateTime(ref e) => write!(f, "{}", e),
DataType::Error(ref e) => write!(f, "{}", e),
DataType::Empty => Ok(()),
}
Expand Down
3 changes: 3 additions & 0 deletions src/de.rs
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,7 @@ impl<'a, 'de> serde::Deserializer<'de> for DataTypeDeserializer<'a> {
DataType::Bool(v) => visitor.visit_bool(*v),
DataType::Int(v) => visitor.visit_i64(*v),
DataType::Empty => visitor.visit_unit(),
DataType::DateTime(v) => visitor.visit_f64(*v),
DataType::Error(ref err) => Err(DeError::CellError {
err: err.clone(),
pos: self.pos,
Expand All @@ -588,6 +589,7 @@ impl<'a, 'de> serde::Deserializer<'de> for DataTypeDeserializer<'a> {
DataType::Float(v) => visitor.visit_str(&v.to_string()),
DataType::Int(v) => visitor.visit_str(&v.to_string()),
DataType::Bool(v) => visitor.visit_str(&v.to_string()),
DataType::DateTime(v) => visitor.visit_str(&v.to_string()),
DataType::Error(ref err) => Err(DeError::CellError {
err: err.clone(),
pos: self.pos,
Expand Down Expand Up @@ -638,6 +640,7 @@ impl<'a, 'de> serde::Deserializer<'de> for DataTypeDeserializer<'a> {
DataType::Empty => visitor.visit_bool(false),
DataType::Float(v) => visitor.visit_bool(*v != 0.),
DataType::Int(v) => visitor.visit_bool(*v != 0),
DataType::DateTime(v) => visitor.visit_bool(*v != 0.),
DataType::Error(ref err) => Err(DeError::CellError {
err: err.clone(),
pos: self.pos,
Expand Down
180 changes: 170 additions & 10 deletions src/xlsx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,12 @@ impl FromStr for CellErrorType {
}
}

/// Coarse classification of the number format attached to a cell style.
///
/// One entry is recorded per `xf` element of `cellXfs` in `xl/styles.xml`;
/// a cell's `s` attribute is used as an index into that list.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CellFormat {
    /// Any format that was not recognised as a date/time format.
    Other,
    /// A number format recognised as a date and/or time format.
    Date,
}

/// A struct representing xml zipped excel file
/// Xlsx, Xlsm, Xlam
pub struct Xlsx<RS>
Expand All @@ -145,6 +151,8 @@ where
strings: Vec<String>,
/// Sheets paths
sheets: Vec<(String, String)>,
/// Cell (number) formats
formats: Vec<CellFormat>,
/// Metadata
metadata: Metadata,
}
Expand Down Expand Up @@ -173,6 +181,82 @@ impl<RS: Read + Seek> Xlsx<RS> {
Ok(())
}

/// Loads the cell number formats from `xl/styles.xml` into `self.formats`.
///
/// Two sections of the style sheet are inspected:
///
/// * `numFmts`: every `numFmt` child is remembered as a mapping from its raw
///   `numFmtId` bytes to its decoded `formatCode`.
/// * `cellXfs`: for every `xf` child one [`CellFormat`] is appended to
///   `self.formats`. It is `Date` when the `numFmtId` refers to a remembered
///   custom format accepted by `is_custom_date_format`, or to no custom format
///   but an id accepted by `is_builtin_date_format_id`; otherwise it is `Other`.
///
/// Custom formats are only known for `xf` elements read after the `numFmts`
/// section. A missing `xl/styles.xml` is not an error.
///
/// # Errors
///
/// Returns `XlsxError::XmlEof` if the document ends before `numFmts`, `cellXfs`
/// or `styleSheet` is closed, and `XlsxError::Xml` on reader failures.
/// Malformed attributes on `numFmt` are propagated; malformed attributes on
/// `xf` are skipped.
fn read_styles(&mut self) -> Result<(), XlsxError> {
    let mut xml = match xml_reader(&mut self.zip, "xl/styles.xml") {
        Some(reader) => reader?,
        None => return Ok(()),
    };

    // Custom number formats: raw `numFmtId` bytes -> format code.
    let mut custom_formats = HashMap::new();

    let mut outer_buf = Vec::new();
    let mut section_buf = Vec::new();
    loop {
        outer_buf.clear();
        match xml.read_event(&mut outer_buf) {
            Ok(Event::Start(ref e)) if e.local_name() == b"numFmts" => loop {
                section_buf.clear();
                match xml.read_event(&mut section_buf) {
                    Ok(Event::Start(ref e)) if e.local_name() == b"numFmt" => {
                        let mut fmt_id = Vec::new();
                        let mut code = String::new();
                        for attr in e.attributes() {
                            match attr? {
                                Attribute {
                                    key: b"numFmtId",
                                    value,
                                } => fmt_id.extend_from_slice(&value),
                                Attribute {
                                    key: b"formatCode",
                                    value,
                                } => code = xml.decode(&value).into_owned(),
                                _ => (),
                            }
                        }
                        custom_formats.insert(fmt_id, code);
                    }
                    Ok(Event::End(ref e)) if e.local_name() == b"numFmts" => break,
                    Ok(Event::Eof) => return Err(XlsxError::XmlEof("numFmts")),
                    Err(e) => return Err(XlsxError::Xml(e)),
                    _ => (),
                }
            },
            Ok(Event::Start(ref e)) if e.local_name() == b"cellXfs" => loop {
                section_buf.clear();
                match xml.read_event(&mut section_buf) {
                    Ok(Event::Start(ref e)) if e.local_name() == b"xf" => {
                        // Attributes that fail to parse are skipped rather than reported.
                        let num_fmt_id = e
                            .attributes()
                            .filter_map(Result::ok)
                            .find(|attr| attr.key == b"numFmtId");

                        let cell_format = match num_fmt_id {
                            None => CellFormat::Other,
                            Some(attr) => {
                                // A custom format with this id takes precedence over
                                // the built-in id table.
                                let is_date = match custom_formats.get(&*attr.value) {
                                    Some(code) => is_custom_date_format(code),
                                    None => is_builtin_date_format_id(&attr.value),
                                };
                                if is_date {
                                    CellFormat::Date
                                } else {
                                    CellFormat::Other
                                }
                            }
                        };
                        self.formats.push(cell_format);
                    }
                    Ok(Event::End(ref e)) if e.local_name() == b"cellXfs" => break,
                    Ok(Event::Eof) => return Err(XlsxError::XmlEof("cellXfs")),
                    Err(e) => return Err(XlsxError::Xml(e)),
                    _ => (),
                }
            },
            Ok(Event::End(ref e)) if e.local_name() == b"styleSheet" => break,
            Ok(Event::Eof) => return Err(XlsxError::XmlEof("styleSheet")),
            Err(e) => return Err(XlsxError::Xml(e)),
            _ => (),
        }
    }
    Ok(())
}

fn read_workbook(&mut self, relationships: &HashMap<Vec<u8>, String>) -> Result<(), XlsxError> {
let mut xml = match xml_reader(&mut self.zip, "xl/workbook.xml") {
None => return Ok(()),
Expand Down Expand Up @@ -280,12 +364,18 @@ impl<RS: Read + Seek> Xlsx<RS> {

fn worksheet<T, F>(
strings: &[String],
formats: &[CellFormat],
mut xml: XlsReader<'_>,
read_data: &mut F,
) -> Result<Range<T>, XlsxError>
where
T: Default + Clone + PartialEq,
F: FnMut(&[String], &mut XlsReader<'_>, &mut Vec<Cell<T>>) -> Result<(), XlsxError>,
F: FnMut(
&[String],
&[CellFormat],
&mut XlsReader<'_>,
&mut Vec<Cell<T>>,
) -> Result<(), XlsxError>,
{
let mut cells = Vec::new();
let mut buf = Vec::new();
Expand Down Expand Up @@ -313,7 +403,7 @@ where
return Err(XlsxError::UnexpectedNode("dimension"));
}
b"sheetData" => {
read_data(&strings, &mut xml, &mut cells)?;
read_data(&strings, &formats, &mut xml, &mut cells)?;
break;
}
_ => (),
Expand All @@ -338,10 +428,12 @@ impl<RS: Read + Seek> Reader for Xlsx<RS> {
let mut xlsx = Xlsx {
zip: ZipArchive::new(reader)?,
strings: Vec::new(),
formats: Vec::new(),
sheets: Vec::new(),
metadata: Metadata::default(),
};
xlsx.read_shared_strings()?;
xlsx.read_styles()?;
let relationships = xlsx.read_relationships()?;
xlsx.read_workbook(&relationships)?;
Ok(xlsx)
Expand All @@ -366,9 +458,10 @@ impl<RS: Read + Seek> Reader for Xlsx<RS> {
None => return None,
};
let strings = &self.strings;
let formats = &self.formats;
xml.map(|xml| {
worksheet(strings, xml?, &mut |s, xml, cells| {
read_sheet_data(xml, s, cells)
worksheet(strings, formats, xml?, &mut |s, f, xml, cells| {
read_sheet_data(xml, s, f, cells)
})
})
}
Expand All @@ -380,8 +473,9 @@ impl<RS: Read + Seek> Reader for Xlsx<RS> {
};

let strings = &self.strings;
let formats = &self.formats;
xml.map(|xml| {
worksheet(strings, xml?, &mut |_, xml, cells| {
worksheet(strings, formats, xml?, &mut |_, _, xml, cells| {
read_sheet(xml, cells, &mut |cells, xml, e, pos, _| {
match e.local_name() {
b"is" | b"v" => xml.read_to_end(e.name(), &mut Vec::new())?,
Expand Down Expand Up @@ -483,15 +577,25 @@ where
fn read_sheet_data(
xml: &mut XlsReader<'_>,
strings: &[String],
formats: &[CellFormat],
cells: &mut Vec<Cell<DataType>>,
) -> Result<(), XlsxError> {
/// read the contents of a <v> cell
fn read_value<'a>(
v: String,
strings: &[String],
atts: Attributes<'a>,
formats: &[CellFormat],
c_element: &BytesStart<'a>,
) -> Result<DataType, XlsxError> {
match get_attribute(atts, b"t")? {
let is_date_time = match get_attribute(c_element.attributes(), b"s") {
Ok(Some(style)) => {
let id: usize = std::str::from_utf8(style).unwrap_or("0").parse()?;
matches!(formats.get(id), Some(CellFormat::Date))
}
_ => false,
};

match get_attribute(c_element.attributes(), b"t")? {
Some(b"s") => {
// shared string
let idx: usize = v.parse()?;
Expand Down Expand Up @@ -530,14 +634,26 @@ fn read_sheet_data(
Some(b"n") => {
// n - number
v.parse()
.map(DataType::Float)
.map(|n| {
if is_date_time {
DataType::DateTime(n)
} else {
DataType::Float(n)
}
})
.map_err(XlsxError::ParseFloat)
}
None => {
// If type is not known, we try to parse as Float for utility, but fall back to
// String if this fails.
v.parse()
.map(DataType::Float)
.map(|n| {
if is_date_time {
DataType::DateTime(n)
} else {
DataType::Float(n)
}
})
.map_err(XlsxError::ParseFloat)
.or_else::<XlsxError, _>(|_| Ok(DataType::String(v)))
}
Expand Down Expand Up @@ -566,7 +682,7 @@ fn read_sheet_data(
b"v" => {
// value
let v = xml.read_text(e.name(), &mut Vec::new())?;
match read_value(v, strings, c_element.attributes())? {
match read_value(v, strings, formats, c_element)? {
DataType::Empty => (),
v => cells.push(Cell::new(pos, v)),
}
Expand All @@ -578,6 +694,50 @@ fn read_sheet_data(
})
}

/// Heuristically decides whether a custom number format code is a date/time
/// format.
///
/// A format is accepted when it consists solely of date/time placeholder
/// letters (`d`, `m`, `y`, `h`, `s` in either case) and the separators `-`,
/// `/`, `:`, `.`, space and backslash, and contains at least one placeholder
/// letter. Empty or separator-only codes are therefore rejected, while time
/// formats such as `hh:mm:ss` are accepted.
///
/// This is deliberately conservative and definitely not perfect: anything
/// containing other characters (quoted literals, `AM/PM`, bracketed sections,
/// digits, ...) is reported as not being a date format.
fn is_custom_date_format(format: &str) -> bool {
    let mut has_date_part = false;
    for chr in format.bytes() {
        match chr {
            // Date/time placeholders.
            b'm' | b'd' | b'y' | b'h' | b's' | b'M' | b'D' | b'Y' | b'H' | b'S' => {
                has_date_part = true
            }
            // Separators commonly found between the placeholders.
            b'-' | b'/' | b':' | b'.' | b' ' | b'\\' => (),
            _ => return false,
        }
    }

    // Separators alone (or an empty code) do not make a date format.
    has_date_part
}

/// Reports whether `id` (the raw bytes of a `numFmtId` attribute) is one of
/// the built-in number format ids treated as date/time formats.
///
/// The comparison is an exact byte match against the decimal ids listed
/// below; no numeric parsing or normalisation is performed.
fn is_builtin_date_format_id(id: &[u8]) -> bool {
    const DATE_TIME_IDS: [&[u8]; 12] = [
        b"14", // mm-dd-yy
        b"15", // d-mmm-yy
        b"16", // d-mmm
        b"17", // mmm-yy
        b"18", // h:mm AM/PM
        b"19", // h:mm:ss AM/PM
        b"20", // h:mm
        b"21", // h:mm:ss
        b"22", // m/d/yy h:mm
        b"45", // mm:ss
        b"46", // [h]:mm:ss
        b"47", // mmss.0
    ];

    DATE_TIME_IDS.iter().any(|known| *known == id)
}

#[derive(Debug, PartialEq)]
struct Dimensions {
start: (u32, u32),
Expand Down
Binary file added tests/date.xlsx
Binary file not shown.
19 changes: 18 additions & 1 deletion tests/test.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use calamine::CellErrorType::*;
use calamine::DataType::{Bool, Empty, Error, Float, String};
use calamine::DataType::{Bool, DateTime, Empty, Error, Float, String};
use calamine::{open_workbook, open_workbook_auto, Ods, Reader, Xls, Xlsb, Xlsx};
use std::io::Cursor;
use std::sync::Once;
Expand Down Expand Up @@ -675,3 +675,20 @@ fn issue_174() {
let mut xls: Xlsx<_> = open_workbook(&path).unwrap();
xls.worksheet_range_at(0).unwrap().unwrap();
}

/// The top-left cell of `tests/date.xlsx` must be read as a `DateTime`
/// carrying the serial value 44197, which maps to 2021-01-01 when the
/// `dates` feature is enabled.
#[test]
fn date() {
    setup();

    let workbook_path = format!("{}/tests/date.xlsx", env!("CARGO_MANIFEST_DIR"));
    let mut workbook: Xlsx<_> = open_workbook(&workbook_path).unwrap();
    let sheet = workbook.worksheet_range_at(0).unwrap().unwrap();

    let expected_cell = DateTime(44197.0);
    assert_eq!(sheet.get_value((0, 0)), Some(&expected_cell));

    #[cfg(feature = "dates")]
    {
        let expected_date = chrono::NaiveDate::from_ymd(2021, 1, 1);
        let cell = sheet.get_value((0, 0)).unwrap();
        assert_eq!(cell.as_date(), Some(expected_date));
    }
}