Skip to content

Commit a4c4404

Browse files
committed
richer consume_time validator
1 parent 844ef64 commit a4c4404

File tree

4 files changed

+72
-5
lines changed

4 files changed

+72
-5
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ The supported types are inspired by - but not a perfect match to - BigQuery data
5252
- `INT64` - a json number without any exponent, between int64 min and max.
5353
- `DECIMAL_29_9` - a json number without an exponent, with up to 29 digits before the decimal point, and up to 9 after (aka BigQuery `NUMERIC`, with default decimal point position)
5454
- `DATE`* - date as a string, without a timezone, as `YYYY-MM-DD`, or `YYYY/MM/DD` or `YYYY.MM.DD`.
55-
- `TIME`* - time as a string, without a timezone, as `HH:MM:SS` (fractional seconds not supported currently)
55+
- `TIME`* - time as a string, without a timezone, as `HH:MM[:SS[.SSSSSS]]` (hours `00`-`23`; seconds are optional, as are up to 6 fractional-second digits).
5656
- `DATETIME`* - date and time as a string, without a timezone, as `YYYY-MM-DDTHH:MM:SS` (in the date part, `-` can be swapped for `/`, `.`).
5757
- `BYTES` - a base64 string.
5858
- `STRUCT` - a sub schema. In this case you need to provide a `"fields": [...]` property in the schema definition, with a list of sub fields. You can nest arbitrarily deeply, and/or use `REPEATED` mode if needed.

benches/micro_util_bench.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,25 @@ fn criterion_benchmark(c: &mut Criterion) {
492492
group.finish();
493493

494494

495+
496+
let mut group = c.benchmark_group("consume_time");
497+
// build a pool of null / valid time strings (`HH:MM` and `HH:MM:SS` with fractional seconds) to cycle through
498+
let mut values: Vec<u8pOwned> = Vec::with_capacity(10000);
499+
for _ in 0..values.capacity() {
500+
if rng.random_bool(0.1){
501+
values.push(u8pOwned::from("null"));
502+
} if rng.random_bool(0.4) {
503+
values.push(u8pOwned::from("\"23:11\" "));
504+
} if rng.random_bool(0.5) {
505+
values.push(u8pOwned::from("\"23:11:12.01\" "));
506+
} else {
507+
values.push(u8pOwned::from("\"23:11:12.0111\" "));
508+
}
509+
}
510+
let mut value_iter = values.iter().cycle();
511+
group.bench_function("simd", |b| b.iter(|| micro_util::consume_time( black_box(&value_iter.next().unwrap().as_borrowed()))));
512+
group.finish();
513+
495514
}
496515

497516
criterion_group!(benches, criterion_benchmark);

src/micro_util.rs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,33 @@ pub fn consume_decimal_29_9(json: &u8p) -> usize {
390390

391391
}
392392

393+
394+
395+
/// This assumes the json slice is the start of a spec-compliant value (not neccessarily a time string, but definitely compliant value).
396+
/// For a string of the form: `HH:MM[:SS[.SSSSSS]]` it returns the number of bytes, including the start and end double quotes, otherwise zero.
397+
pub fn consume_time(json: &u8p) -> usize {
398+
let lower = u8x16::from(*b"\"00:00:00.000000");
399+
let upper = u8x16::from(*b"\"29:59:59.999999");
400+
401+
let data = json.initial_lane::<16, 0>();
402+
let matched = data.simd_le(upper) & data.simd_ge(lower);
403+
let mut ret = matched.to_bitmask().trailing_ones() as usize;
404+
let next_char = json.raw_u8s().get(ret).unwrap_or(&0);
405+
ret += 1; // add closing quote
406+
407+
let check1= *next_char == b'"';
408+
let check2 = (ret == "\"00:00\"".len()) | (ret >= b"\"00:00:00\"".len());
409+
let check3 = (json.raw_u8s()[1] < b'2') | (json.raw_u8s()[2] <= b'3');
410+
411+
if check1 && check2 && check3 {
412+
return ret;
413+
} else {
414+
return 0;
415+
}
416+
}
417+
418+
419+
393420
/// Assumes json is valid, and is the start of a value of some kind. If it is the start of an array, it
394421
/// finds the matching end of the array, and similarly for start/end of objects, returning the number of
395422
/// bytes needed to get to the end. For true/false, strings and numbers it will return the byte count (by making
@@ -682,6 +709,30 @@ mod tests {
682709
assert_eq!(consume_punct::<b'}', true>(&u8p!(b"}\t } ")), 3);
683710
}
684711

712+
#[test]
fn test_consume_time() {
    // Values that are not strings, and strings that are not times, consume nothing.
    assert_eq!(consume_time(&u8p!(b"null")), 0);
    assert_eq!(consume_time(&u8p!(b"true")), 0);
    assert_eq!(consume_time(&u8p!(b"false")), 0);
    assert_eq!(consume_time(&u8p!(b"1")), 0);
    assert_eq!(consume_time(&u8p!(b"-1")), 0);
    assert_eq!(consume_time(&u8p!(b"[false]")), 0);
    assert_eq!(consume_time(&u8p!(b"{false}")), 0);
    assert_eq!(consume_time(&u8p!(b"\"hello\"")), 0);

    // Hours and minutes only.
    assert_eq!(consume_time(&u8p!(b"\"12:45\" ")), 7);
    assert_eq!(consume_time(&u8p!(b"\"00:00\" ")), 7); // lower boundary
    assert_eq!(consume_time(&u8p!(b"\"12:60\" ")), 0); // 60 minutes is out of range
    assert_eq!(consume_time(&u8p!(b"\"1:45\" ")), 0); // hours must be two digits

    // Hours, minutes and seconds.
    assert_eq!(consume_time(&u8p!(b"\"12:45:08\" ")), 10);
    assert_eq!(consume_time(&u8p!(b"\"19:59:59\" ")), 10);
    assert_eq!(consume_time(&u8p!(b"\"20:00:00\" ")), 10);
    assert_eq!(consume_time(&u8p!(b"\"23:45:08\" ")), 10);
    assert_eq!(consume_time(&u8p!(b"\"33:45:08\" ")), 0); // 33 hrs in a day..no
    assert_eq!(consume_time(&u8p!(b"\"24:45:08\" ")), 0); // 24 o'clock is the next day, so no
    assert_eq!(consume_time(&u8p!(b"\"12:45:60\" ")), 0); // 60 seconds is out of range
    assert_eq!(consume_time(&u8p!(b"\"12:45:0\" ")), 0); // seconds must be two digits

    // Fractional seconds: one to six digits.
    assert_eq!(consume_time(&u8p!(b"\"12:45:08.1\" ")), 12);
    assert_eq!(consume_time(&u8p!(b"\"12:45:08.0123\" ")), 15);
    assert_eq!(consume_time(&u8p!(b"\"12:45:08.012345\" ")), 17);
    assert_eq!(consume_time(&u8p!(b"\"23:59:59.999999\" ")), 17); // upper boundary
    assert_eq!(consume_time(&u8p!(b"\"12:45:08.0123456\" ")), 0); // seven digits is too many
    assert_eq!(consume_time(&u8p!(b"\"12:45:08x0123\" ")), 0); // separator must be '.'
}
735+
685736
#[test]
686737
fn test_consume_within_range() {
687738
pub const QUOTED_DATE_LOWER: &[u8; 12] = b"\"0000-00-00\"";

src/validate.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -206,10 +206,7 @@ pub fn validate<'a, 'b>(root_schema: &'a AdaptivePrefixMap<Field>, max_field_idx
206206
// missing: check = ((json[6] < '1') | (json[7] <= '2')) & (json[9] < '3') | (json[10] <= '1') ...but still allows invalid dates (do we need to deal with months, what about leap years?!)
207207
},
208208
FieldType::TIME => {
209-
const QUOTED_TIME_LOWER: &[u8; 10] = b"\"00:00:00\"";
210-
const QUOTED_TIME_UPPER: &[u8; 10] = b"\"29:59:59\"";
211-
micro_util::consume_within_range(&json_offset, QUOTED_TIME_LOWER, QUOTED_TIME_UPPER)
212-
// missing: check = (json[1] < '2') | (json[2] <= '3')
209+
micro_util::consume_time(&json_offset)
213210
},
214211
FieldType::DATETIME => {
215212
const QUOTED_DATETIME_LOWER: &[u8; 21] = b"\"0000-00-00T00:00:00\"";

0 commit comments

Comments
 (0)