@@ -142,11 +142,11 @@ cleanepi::scan_data(raw_ebola_data)
142142
143143``` output
144144 Field_names missing numeric date character logical
145- 1 age 0.0646 0.8348 0.0000 0.1006 0
146- 2 gender 0.1578 0.0472 0.0000 0.7950 0
147- 3 status 0.0535 0.0000 0.0000 0.9465 0
148- 4 date onset 0.0001 0.0000 0.9159 0.0840 0
149- 5 date sample 0.0001 0.0000 0.9999 0.0000 0
145+ 1 age 0.0690 0.8925 0.0000 0.1075 0
146+ 2 gender 0.1874 0.0560 0.0000 0.9440 0
147+ 3 status 0.0565 0.0000 0.0000 1.0000 0
148+ 4 date onset 0.0001 0.0000 0.9159 0.0841 0
149+ 5 date sample 0.0001 0.0000 1.0000 0.0000 0
1501506 region 0.0000 0.0000 0.0000 1.0000 0
151151```
152152
@@ -218,7 +218,7 @@ sim_ebola_data <- cleanepi::remove_duplicates(sim_ebola_data)
218218
219219``` output
220220! Found 5 duplicated rows in the dataset.
221- ℹ Use `attr (dat, "report")[["duplicated_rows"]] ` to access them, where "dat" is
221+ ℹ Use `print_report(dat, "found_duplicates")` to access them, where "dat" is
222222 the object used to store the output from this operation.
223223```
224224
@@ -294,9 +294,8 @@ df %>%
294294
295295``` output
296296! Constant data was removed after 2 iterations.
297- ℹ Enter `attr(dat, "report")[["constant_data"]]` for more information, where
298- "dat" represents the object used to store the output from
299- `remove_constants()`.
297+ ℹ Enter `print_report(dat, "constant_data")` for more information, where "dat"
298+ represents the object used to store the output from `remove_constants()`.
300299```
301300
302301``` output
@@ -315,9 +314,8 @@ df %>%
315314
316315``` output
317316! Constant data was removed after 2 iterations.
318- ℹ Enter `attr(dat, "report")[["constant_data"]]` for more information, where
319- "dat" represents the object used to store the output from
320- `remove_constants()`.
317+ ℹ Enter `print_report(dat, "constant_data")` for more information, where "dat"
318+ represents the object used to store the output from `remove_constants()`.
321319```
322320
323321``` output
@@ -348,19 +346,19 @@ sim_ebola_data
348346```
349347
350348``` output
351- # A tibble: 15,000 × 8
352- v1 case_id age gender status date_onset date_sample row_id
353- <int> <int> <chr> <chr> <chr> <chr> <chr> <int>
354- 1 1 14905 90 1 confirmed 03/15/2015 06/04/2015 1
355- 2 2 13043 twenty-five 2 <NA> Sep /11/13 03/01/2014 2
356- 3 3 14364 54 f <NA> 09/02/2014 03/03/2015 3
357- 4 4 14675 ninety <NA> <NA> 10/19/2014 31/ 12 /14 4
358- 5 5 12648 74 F <NA> 08/06/2014 10/10/2016 5
359- 6 6 14274 seventy-six female <NA> Apr /05/15 01/23/2016 7
360- 7 7 14132 sixteen male confirmed Dec /29/Y 05/10/2015 8
361- 8 8 14715 44 f confirmed Apr /06/Y 04/24/2016 9
362- 9 9 13435 26 1 <NA> 09/07/2014 20/ 09 /14 10
363- 10 10 14816 thirty f <NA> 06/29/2015 06/02/2015 11
349+ # A tibble: 15,000 × 7
350+ v1 case_id age gender status date_onset date_sample
351+ <int> <int> <chr> <chr> <chr> <chr> <chr>
352+ 1 1 14905 90 1 confirmed 03/15/2015 06/04/2015
353+ 2 2 13043 twenty-five 2 <NA> sep /11/13 03/01/2014
354+ 3 3 14364 54 f <NA> 09/02/2014 03/03/2015
355+ 4 4 14675 ninety <NA> <NA> 10/19/2014 31/ 12 /14
356+ 5 5 12648 74 F <NA> 08/06/2014 10/10/2016
357+ 6 6 14274 seventy-six female <NA> apr /05/15 01/23/2016
358+ 7 7 14132 sixteen male confirmed dec /29/y 05/10/2015
359+ 8 8 14715 44 f confirmed apr /06/y 04/24/2016
360+ 9 9 13435 26 1 <NA> 09/07/2014 20/ 09 /14
361+ 10 10 14816 thirty f <NA> 06/29/2015 06/02/2015
364362# ℹ 14,990 more rows
365363```
366364
@@ -386,10 +384,10 @@ sim_ebola_data <-
386384```
387385
388386``` output
389- ! Found 1957 duplicated values in the subject Ids .
390- ℹ Enter `attr( dat, "report")[["duplicated_rows"]] ` to access them, where "dat"
391- is the object used to store the output from this operation.
392- ℹ No incorrect subject id was detected .
387+ ! Detected 0 missing, 1957 duplicated, and 0 incorrect subject IDs.
388+ ℹ Enter `print_report(data = dat, "incorrect_subject_id")` to access them,
389+ where "dat" is the object used to store the output from this operation.
390+ ℹ You can use the `correct_subject_ids()` function to correct them.
393391```
394392
395393Note that our simulated dataset does contain duplicated subject IDs.
@@ -429,24 +427,33 @@ sim_ebola_data <- cleanepi::standardize_dates(
429427 " date_sample"
430428 )
431429)
430+ ```
431+
432+ ``` output
433+ ! Detected 1142 values that comply with multiple formats and no values that are
434+ outside of the specified time frame.
435+ ℹ Enter `print_report(data = dat, "date_standardization")` to access them,
436+ where "dat" is the object used to store the output from this operation.
437+ ```
432438
439+ ``` r
433440sim_ebola_data
434441```
435442
436443``` output
437- # A tibble: 15,000 × 8
438- v1 case_id age gender status date_onset date_sample row_id
439- <int> <chr> <chr> <chr> <chr> <date> <date> <int>
440- 1 1 14905 90 1 confirmed 2015-03-15 2015-06-04 1
441- 2 2 13043 twenty-five 2 <NA> 2013-09-11 2014-03-01 2
442- 3 3 14364 54 f <NA> 2014-09-02 2015-03-03 3
443- 4 4 14675 ninety <NA> <NA> 2014-10-19 2031 -12-14 4
444- 5 5 12648 74 F <NA> 2014-08-06 2016-10-10 5
445- 6 6 14274 seventy-six female <NA> 2015-04-05 2016-01-23 7
446- 7 7 14132 sixteen male confirmed NA 2015-05-10 8
447- 8 8 14715 44 f confirmed NA 2016-04-24 9
448- 9 9 13435 26 1 <NA> 2014-09-07 2020 -09-14 10
449- 10 10 14816 thirty f <NA> 2015-06-29 2015-06-02 11
444+ # A tibble: 15,000 × 7
445+ v1 case_id age gender status date_onset date_sample
446+ <int> <chr> <chr> <chr> <chr> <date> <date>
447+ 1 1 14905 90 1 confirmed 2015-03-15 2015-04-06
448+ 2 2 13043 twenty-five 2 <NA> 2013-09-11 2014-01-03
449+ 3 3 14364 54 f <NA> 2014-02-09 2015-03-03
450+ 4 4 14675 ninety <NA> <NA> 2014-10-19 2014-12-31
451+ 5 5 12648 74 F <NA> 2014-06-08 2016-10-10
452+ 6 6 14274 seventy-six female <NA> 2015-04-05 2016-01-23
453+ 7 7 14132 sixteen male confirmed NA 2015-10-05
454+ 8 8 14715 44 f confirmed NA 2016-04-24
455+ 9 9 13435 26 1 <NA> 2014-07-09 2014-09-20
456+ 10 10 14816 thirty f <NA> 2015-06-29 2015-02-06
450457# ℹ 14,990 more rows
451458```
452459
@@ -481,19 +488,19 @@ sim_ebola_data
481488```
482489
483490``` output
484- # A tibble: 15,000 × 8
485- v1 case_id age gender status date_onset date_sample row_id
486- <int> <chr> <dbl> <chr> <chr> <date> <date> <int>
487- 1 1 14905 90 1 confirmed 2015-03-15 2015-06-04 1
488- 2 2 13043 25 2 <NA> 2013-09-11 2014-03-01 2
489- 3 3 14364 54 f <NA> 2014-09-02 2015-03-03 3
490- 4 4 14675 90 <NA> <NA> 2014-10-19 2031 -12-14 4
491- 5 5 12648 74 F <NA> 2014-08-06 2016-10-10 5
492- 6 6 14274 76 female <NA> 2015-04-05 2016-01-23 7
493- 7 7 14132 16 male confirmed NA 2015-05-10 8
494- 8 8 14715 44 f confirmed NA 2016-04-24 9
495- 9 9 13435 26 1 <NA> 2014-09-07 2020 -09-14 10
496- 10 10 14816 30 f <NA> 2015-06-29 2015-06-02 11
491+ # A tibble: 15,000 × 7
492+ v1 case_id age gender status date_onset date_sample
493+ <int> <chr> <dbl> <chr> <chr> <date> <date>
494+ 1 1 14905 90 1 confirmed 2015-03-15 2015-04-06
495+ 2 2 13043 25 2 <NA> 2013-09-11 2014-01-03
496+ 3 3 14364 54 f <NA> 2014-02-09 2015-03-03
497+ 4 4 14675 90 <NA> <NA> 2014-10-19 2014-12-31
498+ 5 5 12648 74 F <NA> 2014-06-08 2016-10-10
499+ 6 6 14274 76 female <NA> 2015-04-05 2016-01-23
500+ 7 7 14132 16 male confirmed NA 2015-10-05
501+ 8 8 14715 44 f confirmed NA 2016-04-24
502+ 9 9 13435 26 1 <NA> 2014-07-09 2014-09-20
503+ 10 10 14816 30 f <NA> 2015-06-29 2015-02-06
497504# ℹ 14,990 more rows
498505```
499506
@@ -529,9 +536,13 @@ cleanepi::check_date_sequence(
529536```
530537
531538``` output
532- ! Detected 16 incorrect date sequences at lines: "10, 20, 22, 26, 29, 44, 46,
533- 54, 60, 63, 70, 71, 73, 80, 81, 90".
534- ℹ Enter `attr(dat, "report")[["incorrect_date_sequence"]]` to access them,
539+ ℹ Cannot check the sequence of date events across 37 rows due to missing data.
540+ ```
541+
542+ ``` output
543+ ! Detected 24 incorrect date sequences at lines: "8, 15, 18, 20, 21, 23, 26,
544+ 28, 29, 32, 34, 35, 37, 38, 40, 43, 46, 49, 52, 54, 56, 58, 60, 63".
545+ ℹ Enter `print_report(data = dat, "incorrect_date_sequence")` to access them,
535546 where "dat" is the object used to store the output from this operation.
536547```
537548
@@ -586,19 +597,19 @@ sim_ebola_data
586597```
587598
588599``` output
589- # A tibble: 15,000 × 8
590- v1 case_id age gender status date_onset date_sample row_id
591- <int> <chr> <dbl> <chr> <chr> <date> <date> <int>
592- 1 1 14905 90 male confirmed 2015-03-15 2015-06-04 1
593- 2 2 13043 25 female <NA> 2013-09-11 2014-03-01 2
594- 3 3 14364 54 female <NA> 2014-09-02 2015-03-03 3
595- 4 4 14675 90 <NA> <NA> 2014-10-19 2031 -12-14 4
596- 5 5 12648 74 female <NA> 2014-08-06 2016-10-10 5
597- 6 6 14274 76 female <NA> 2015-04-05 2016-01-23 7
598- 7 7 14132 16 male confirmed NA 2015-05-10 8
599- 8 8 14715 44 female confirmed NA 2016-04-24 9
600- 9 9 13435 26 male <NA> 2014-09-07 2020 -09-14 10
601- 10 10 14816 30 female <NA> 2015-06-29 2015-06-02 11
600+ # A tibble: 15,000 × 7
601+ v1 case_id age gender status date_onset date_sample
602+ <int> <chr> <dbl> <chr> <chr> <date> <date>
603+ 1 1 14905 90 male confirmed 2015-03-15 2015-04-06
604+ 2 2 13043 25 female <NA> 2013-09-11 2014-01-03
605+ 3 3 14364 54 female <NA> 2014-02-09 2015-03-03
606+ 4 4 14675 90 <NA> <NA> 2014-10-19 2014-12-31
607+ 5 5 12648 74 female <NA> 2014-06-08 2016-10-10
608+ 6 6 14274 76 female <NA> 2015-04-05 2016-01-23
609+ 7 7 14132 16 male confirmed NA 2015-10-05
610+ 8 8 14715 44 female confirmed NA 2016-04-24
611+ 9 9 13435 26 male <NA> 2014-07-09 2014-09-20
612+ 10 10 14816 30 female <NA> 2015-06-29 2015-02-06
602613# ℹ 14,990 more rows
603614```
604615
@@ -680,16 +691,16 @@ sim_ebola_data %>%
680691# A tibble: 15,000 × 4
681692 case_id date_sample years_since_collection remainder_months
682693 <chr> <date> <dbl> <dbl>
683- 1 14905 2015-06-04 9 7
684- 2 13043 2014-03-01 10 10
694+ 1 14905 2015-04-06 9 8
695+ 2 13043 2014-01-03 11 0
685696 3 14364 2015-03-03 9 10
686- 4 14675 2031 -12-14 -6 -11
697+ 4 14675 2014-12-31 10 0
687698 5 12648 2016-10-10 8 2
688699 6 14274 2016-01-23 8 11
689- 7 14132 2015-05-10 9 7
700+ 7 14132 2015-10-05 9 2
690701 8 14715 2016-04-24 8 8
691- 9 13435 2020 -09-14 4 3
692- 10 14816 2015-06-02 9 7
702+ 9 13435 2014-09-20 10 3
703+ 10 14816 2015-02-06 9 10
693704# ℹ 14,990 more rows
694705```
695706
@@ -751,6 +762,10 @@ dat_clean <- dat %>%
751762```
752763
753764``` output
765+ ! Detected 4 values that comply with multiple formats and no values that are
766+ outside of the specified time frame.
767+ ℹ Enter `print_report(data = dat, "date_standardization")` to access them,
768+ where "dat" is the object used to store the output from this operation.
754769! Found <numeric> values that could also be of type <Date> in column:
755770 date_of_birth.
756771ℹ It is possible to convert them into <Date> using: `lubridate::as_date(x,
@@ -793,16 +808,16 @@ dat_clean %>%
793808# A tibble: 10 × 6
794809 study_id sex date_first_pcr_posit…¹ date_of_birth age_in_years age_category
795810 <chr> <int> <date> <date> <dbl> <fct>
796- 1 PS001P2 1 2020-12-01 1972-06-01 52 [35,60)
811+ 1 PS001P2 1 2020-12-01 1972-01-06 53 [35,60)
797812 2 PS002P2 1 2021-01-01 1952-02-20 73 [60,Inf]
798813 3 PS004P2… NA 2021-02-11 1961-06-15 63 [60,Inf]
799814 4 PS003P2 1 2021-02-01 1947-11-11 77 [60,Inf]
800815 5 P0005P2 2 2021-02-16 2000-09-26 24 [20,35)
801816 6 PS006P2 2 2021-05-02 NA NA <NA>
802- 7 PB500P2 1 2021-02-19 1989-11-03 35 [35,60)
803- 8 PS008P2 2 2021-09-20 1976-10-05 48 [35,60)
817+ 7 PB500P2 1 2021-02-19 1989-03-11 35 [35,60)
818+ 8 PS008P2 2 2021-09-20 1976-05-10 48 [35,60)
804819 9 PS010P2 1 2021-02-26 1991-09-23 33 [20,35)
805- 10 PS011P2 2 2021-03-03 1991-02-08 34 [20,35)
820+ 10 PS011P2 2 2021-03-03 1991-08-02 33 [20,35)
806821# ℹ abbreviated name: ¹date_first_pcr_positive_test
807822```
808823
0 commit comments