Skip to content

Commit

Permalink
standard cleaning process
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewallenbruce committed Aug 2, 2024
1 parent 7dde4b1 commit 243e077
Show file tree
Hide file tree
Showing 16 changed files with 133 additions and 75 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ Imports:
strex,
stringr,
tictoc,
magrittr,
tidyr,
zip
Suggests:
fuimus,
magrittr,
roxyglobals,
testthat (>= 3.0.0)
Remotes:
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

export("%>%")
export(ask)
export(clean_credentials)
export(create_zip_file_names)
export(dispense)
export(download_zips)
Expand Down
2 changes: 0 additions & 2 deletions R/globals.R
Original file line number Diff line number Diff line change
Expand Up @@ -743,8 +743,6 @@ utils::globalVariables(c(
# <peek>
# <prune>
"uncompressed_size",
# <create_zip_file_names>
"zip_paths",
# <peek>
"zipfile",
NULL
Expand Down
29 changes: 20 additions & 9 deletions R/nber.R
Original file line number Diff line number Diff line change
Expand Up @@ -149,22 +149,33 @@ download_zips <- function(table, directory) {
#' @export
create_zip_file_names <- function(x) {
  # Turn release file names/paths into one "week:<start>|<end>" label each.
  #
  # `x` is a character vector of file names or paths. Only the basename is
  # used, so no absolute-path check is made: callers in this project pass
  # bare release ids as well as full paths.

  # Strip the pieces of the file name that are not part of the two dates.
  # The dot is escaped so that only a literal ".zip" is removed.
  file_stems <- stringr::str_remove_all(
    basename(x),
    "\\.zip|week|npidata_pfile_"
  )

  file_stems |>
    # Split every stem into its numeric and non-numeric runs.
    strex::str_split_by_numbers() |>
    # Regroup by position: 1st pieces together, 2nd pieces together, ...
    purrr::list_transpose() |>
    # Drop the 2nd group -- presumably the separator between the two dates
    # (TODO confirm against the actual file-name layout).
    purrr::discard_at(2) |>
    purrr::set_names(c("start", "end")) |>
    purrr::map(anytime::anydate) |>
    # Back to one element per input file, holding its start and end date.
    purrr::list_transpose() |>
    purrr::map(paste0, collapse = "|") |>
    purrr::map(yasp::wrap, left = "week:", right = "") |>
    unlist(use.names = FALSE)
}

#' Strip periods from credentials
#'
#' Deletes every period (`.`) found in the supplied credential strings.
#'
#' @param x a vector of provider credentials
#'
#' @returns a character vector: `x` with all periods removed
#'
#' @autoglobal
#'
#' @keywords internal
#'
#' @export
clean_credentials <- function(x) {
  # Literal (non-regex) match on "." -- no escaping needed.
  gsub(".", "", x, fixed = TRUE)
}
49 changes: 49 additions & 0 deletions data-raw/nber_download.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,52 @@
# monthly = NULL,
# byvar = NULL
# )


# Exploratory script: load two pinned weekly NPI releases, tag each with a
# release label, and tabulate the values of selected columns.

# Look at the first element/column of the `unzipped` entry of the
# "nber_weekly_info" pin.
get_pin("nber_weekly_info")$unzipped[1]

# Load the two weekly release pins.
# NOTE(review): the pins manifest changed in the same commit no longer seems
# to list these two pin names -- confirm they still resolve.
npi_2024_01_01 <- get_pin("npi_2024_01_01__2024_01_07")
npi_2024_01_22 <- get_pin("npi_2024_01_22__2024_01_28")


# Build one release label per pin from each pin's `release` element.
release <- create_zip_file_names(c(npi_2024_01_01$release, npi_2024_01_22$release))

# Replace each pin object with its `base` table, prefixed by a `release`
# column holding that pin's label.
npi_2024_01_01 <- vctrs::vec_cbind(
dplyr::tibble(release = release[1]),
npi_2024_01_01[["base"]]
)

npi_2024_01_22 <- vctrs::vec_cbind(
dplyr::tibble(release = release[2]),
npi_2024_01_22[["base"]]
)

# fuimus::create_vec(names(npi_2024_01_01))

# Names given to the per-column count tables below; must stay in the same
# order as the columns passed to hacksaw::count_split().
clnm <- c(
"entity",
"enum_date",
"deact_date",
"react_date",
"sole_prop",
"org_sub",
"gender",
"credential")

# Count the values of each listed column separately, then stack the results
# into one long table keyed by `var`.
npi_2024_01_01 |>
dplyr::mutate(
# NOTE(review): `:::` is used although NAMESPACE exports clean_credentials;
# `::` (or a bare call) would presumably suffice -- confirm package name.
credential = provider:::clean_credentials(credential)) |>
# presumably returns one count table per column listed -- TODO confirm
hacksaw::count_split(
entity,
enum_date,
deact_date,
react_date,
sole_prop,
org_sub,
gender,
credential
) |>
# Drop rows whose first column (the counted value) is NA.
purrr::map(\(df) dplyr::filter(df, !is.na(df[1]))) |>
# Give the first column a common name so the tables can be row-bound.
purrr::map(\(df) dplyr::rename(df, val = names(df[1]))) |>
purrr::set_names(clnm) |>
# Row-bind all tables; the list names go into the `var` column.
purrr::list_rbind(names_to = "var")
57 changes: 20 additions & 37 deletions data-raw/nber_taxonomy_license.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ get_pin("nber_weekly_info")
#----------- NPIData Base ####
npidata <- dplyr::filter(get_pin("nber_weekly_info")$unzipped, file == "npidata_pfile")

release_id <- tools::file_path_sans_ext(basename(npidata$path[1]))
release_id <- tools::file_path_sans_ext(basename(npidata$path[2]))
release_id

npi_raw <- tidytable::fread(
npidata$path[1],
npidata$path[2],
colClasses = list(character = 1:330)) |>
janitor::clean_names() |>
fuimus::remove_quiet() |>
Expand Down Expand Up @@ -38,7 +38,7 @@ npi_base <- npi_raw |>
middle = provider_middle_name,
last = provider_last_name_legal_name,
suffix = provider_name_suffix_text,
credential = provider_credential_text
credential = clean_credentials(provider_credential_text)
) |>
fuimus::remove_quiet()

Expand All @@ -57,37 +57,25 @@ npi_other <- npi_raw |>
other_last = provider_other_last_name,
other_last_type = provider_other_last_name_type_code,
other_suffix = provider_other_name_suffix_text,
other_credential = provider_other_credential_text
) |>
dplyr::rowwise() |>
dplyr::mutate(na_count = list(sum(is.na(dplyr::c_across(other_org_name:other_credential))))) |>
tidyr::unnest(na_count) |>
dplyr::filter(na_count < 9) |>
dplyr::select(-na_count) |>
fuimus::remove_quiet()
other_credential = clean_credentials(provider_other_credential_text)
)

npi_other
npi_other <- vctrs::vec_slice(npi_other, which(cheapr::row_na_counts(npi_other) < 9))

# Authorized-official fields for each NPI, with periods stripped from the
# credential text.
npi_auth_ofc <- npi_raw |>
  dplyr::reframe(
    npi,
    ao_prefix = authorized_official_name_prefix_text,
    ao_first = authorized_official_first_name,
    ao_middle = authorized_official_middle_name,
    ao_last = authorized_official_last_name,
    ao_suffix = authorized_official_name_suffix_text,
    ao_credential = clean_credentials(authorized_official_credential_text),
    ao_title = authorized_official_title_or_position,
    ao_phone = authorized_official_telephone_number
  )

# Keep only rows with fewer than 8 missing cells. With 8 authorized-official
# columns next to `npi`, this drops rows where every authorized-official
# field is NA (assuming `npi` itself is never NA -- TODO confirm).
npi_auth_ofc <- vctrs::vec_slice(
  npi_auth_ofc,
  which(cheapr::row_na_counts(npi_auth_ofc) < 8)
)

npi_address <- npi_raw |>
dplyr::reframe(
Expand All @@ -114,9 +102,8 @@ npi_address <- npi_raw |>
purrr::map_dfr(fuimus::na_if_common) |>
fuimus::remove_quiet()

npi_address

#----------- NPIData Taxonomy/License ####

cols_pattern <- fuimus::single_line_string("
healthcare_provider_taxonomy_code
|provider_license_number
Expand All @@ -125,7 +112,7 @@ healthcare_provider_taxonomy_code
|healthcare_provider_taxonomy_group"
)

npi_taxonomy_license <- npi_raw |>
npi_tax_lis <- npi_raw |>
dplyr::select(npi, dplyr::matches(rlang::as_string(cols_pattern))) |>
fuimus::remove_quiet() |>
dplyr::mutate(row_id = dplyr::row_number(), .before = 1) |>
Expand Down Expand Up @@ -157,8 +144,6 @@ npi_taxonomy_license <- npi_raw |>
license_state) |>
dplyr::arrange(npi, taxonomy_primary)

npi_taxonomy_license |>
tidyr::nest(data = -npi)

#----------- NPIData Other Identifiers ####
npi_identifiers <- npi_raw |>
Expand Down Expand Up @@ -191,21 +176,19 @@ npi_identifiers <- npi_raw |>
other_id_issuer
)

npi_identifiers

#----------- Weekly Release pin ####
# Bundle every table built above into one named list for this release.
# Each element name appears exactly once.
npi_week <- list(
  release = create_zip_file_names(release_id),
  basic = npi_base,
  address = npi_address,
  other = npi_other,
  authorized = npi_auth_ofc,
  taxonomy = npi_tax_lis,
  identifier = npi_identifiers
)

# Store the bundle as a pin; `name` and `title` are each passed once, since
# duplicated named arguments make the call fail.
pin_update(
  x = npi_week,
  name = "2024-01-22_2024-01-28",
  title = "NBER NPI Weekly Release 2024-01-22"
)
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
file: 2024-01-22_2024-01-28.qs
file_size: 2247579
pin_hash: fa95b74bd411aa78
type: qs
title: NBER NPI Weekly Release 2024-01-22
description: ~
tags: ~
urls: ~
created: 20240802T063427Z
api_version: 1
10 changes: 4 additions & 6 deletions inst/extdata/pins/_pins.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
2024-01-22_2024-01-28:
- 2024-01-22_2024-01-28/20240802T063427Z-fa95b/
nber_weekly_info:
- nber_weekly_info/20240801T132355Z-6664b/
npi_2024_01_01__2024_01_07:
- npi_2024_01_01__2024_01_07/20240802T022458Z-f6dd7/
npi_2024_01_22__2024_01_28:
- npi_2024_01_22__2024_01_28/20240802T021816Z-9c4a6/
npi_wk_040124:
- npi_wk_040124/20240801T200711Z-f0069/
week_2024-01-01_2024-01-07:
- week_2024-01-01_2024-01-07/20240802T063048Z-a6c21/

This file was deleted.

Binary file not shown.

This file was deleted.

Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
file: week_2024-01-01_2024-01-07.qs
file_size: 1754940
pin_hash: a6c21a95965b6788
type: qs
title: NBER NPI Weekly Release 2024-01-01
description: ~
tags: ~
urls: ~
created: 20240802T063048Z
api_version: 1
Binary file not shown.
18 changes: 18 additions & 0 deletions man/clean_credentials.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 243e077

Please sign in to comment.