-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathdfTrials2Long.R
221 lines (183 loc) · 6.34 KB
/
dfTrials2Long.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
### ctrdata package
#' Convert data frame with trial records into long format
#'
#' The function works with procotol- and results- related information.
#' It converts lists and other values that are in a data frame returned
#' by \link{dbGetFieldsIntoDf} into individual rows of a long data frame.
#' From the resulting long data frame, values of interest can be selected
#' using \link{dfName2Value}.
#' The function is particularly useful for fields with complex content,
#' such as node field "\code{clinical_results}" from EUCTR, for which
#' \link{dbGetFieldsIntoDf} returns as a multiply nested list and for
#' which this function then converts every observation of every (leaf)
#' field into a row of its own.
#'
#' @param df Data frame (or tibble) with columns including
#' the trial identifier (\code{_id}) and
#' one or more variables as obtained from
#' \link{dbGetFieldsIntoDf}
#'
#' @return A data frame (or tibble, if \code{tibble} is loaded)
#' with the four columns: `_id`, `identifier`, `name`, `value`
#'
#' @importFrom stringi stri_extract_all_charclass stri_extract_first stri_replace_first
#' @importFrom xml2 xml_text read_html
#'
#' @export
#'
#' @examples
#'
#' dbc <- nodbi::src_sqlite(
#' dbname = system.file("extdata", "demo.sqlite", package = "ctrdata"),
#' collection = "my_trials",
#' RSQLite::SQLITE_RO)
#'
#' dfwide <- dbGetFieldsIntoDf(
#' fields = "clinical_results.participant_flow",
#' con = dbc)
#'
#' dfTrials2Long(df = dfwide)
#'
dfTrials2Long <- function(df) {
# get names
dfn <- names(df)
# check parameters
if (!any("_id" == dfn) ||
ncol(df) == 1L) stop(
"Missing _id column or other variables in 'df'",
call. = FALSE
)
if (any(c("identifier", "name", "value") %in% dfn)) stop(
"Unexpected columns; 'df' should not come from dfTrials2Long",
call. = FALSE
)
# make _id the first column
if (dfn[1] != "_id") {
dfn <- c("_id", dfn[dfn != "_id"])
df <- df[, dfn, drop = FALSE]
}
# helper function
flattenDf <- function(x) {
while (any(vapply(x, is.list, logical(1L)))) {
x <- lapply(x, function(x) if (is.list(x)) x else list(x))
x <- unlist(x, recursive = FALSE, use.names = TRUE)
}
return(x)
}
# columns that are not compatible with the
# later operations are converted to character
conv <- sapply(df, class) == "Date"
conv <- seq_len(ncol(df))[conv]
for (c in conv) df[, c] <- as.character(df[, c, drop = TRUE])
# protect numbers in names
dfn <- gsub("([0-9]+)", "#\\1#", dfn)
# get trial _id
id <- df[["_id"]]
# iterative unnesting, by column, by trial
out <- lapply(
seq_len(ncol(df))[-1],
function(cc) {
# get column name
tn <- dfn[cc]
# inform users
message(tn, rep(" ", 200L - nchar(tn)), "\r", appendLF = FALSE)
# get column as list,
# one item is one trial
ci <- df[[cc]]
# by trial (_id)
o <- lapply(seq_along(ci), function(ct) {
o <- unlist(flattenDf(ci[ct]))
o <- na.omit(o) # TODO
if (is.null(o) || !length(o)) return(NULL)
# check identifiers added by unlist
tst <- stringi::stri_extract_first_regex(names(o), "[0-9]+")
# construct new identifiers from column name and item name
tst[is.na(tst)] <- "0"
if (is.null(names(o))) {tnn <- paste0(tn, ".0")
} else {tnn <- paste0(
tn, ".", tst, ".", stringi::stri_replace_first_regex(
names(o), "[0-9]+", ""))
}
# construct tall df by _id by col
data.frame(
"_id" = id[ct],
"name" = tnn,
"value" = o,
check.names = FALSE,
stringsAsFactors = FALSE,
row.names = NULL)
})
# bind trials within column
as.data.frame(do.call(rbind, o), stringsAsFactors = FALSE)
}
)
# bind list items (were columns) into long df
out <- do.call(rbind, c(out, stringsAsFactors = FALSE))
message(rep(" ", 200L), "\r", appendLF = FALSE)
# convert html entities
htmlEnt <- grepl("&[#a-zA-Z]+;", out[["value"]])
if (any(htmlEnt)) out[["value"]][htmlEnt] <-
sapply(out[["value"]][htmlEnt], function(i) {
xml2::xml_text(xml2::read_html(charToRaw(i)))
}, USE.NAMES = FALSE)
message(". ", appendLF = FALSE)
# name can include from 0 to about 6 number groups, get all
# and concatenate to oid-like string such as "1.2.3.4.5.6",
# e.g. "9.8.2" which should be extracted from an example name:
# clinical...class9.analyzed...count8.@attributes.value2
# except where name is exactly one of dfn
onlyHere <- vapply(
out[["name"]], function(i) !any(i == dfn),
logical(1L), USE.NAMES = FALSE)
# collect all identifiers
out[["identifier"]][onlyHere] <- vapply(
stringi::stri_extract_all_regex(out[["name"]][onlyHere], "[#]?[0-9]+([.#]|$)"),
function(i) paste0(gsub("[.]", "", i[!grepl("^#", i)]), collapse = "."),
character(1L))
message(". ", appendLF = FALSE)
# remove numbers from name
regExps <- c(
"[.]?[0-9]+([.])",
"[.]?[0-9]+$",
"[.]+$",
"[.]?@attributes",
"#"
)
for (i in regExps) {
out[["name"]][onlyHere] <- gsub(
i, "\\1", out[["name"]][onlyHere])
}
# remove protection from numbers
out[["name"]][onlyHere] <- gsub(
"[#]([0-9]+)", "\\1",
out[["name"]][onlyHere], perl = TRUE)
# remove any duplicate rows
out <- unique(out)
# helper to expand identifier into columns
sortByOid <- function(oid) {
oid <- strsplit(oid$identifier, ".", fixed = TRUE)
maxLen <- max(sapply(oid, length))
oid <- lapply(oid, function(i) c(as.numeric(i), rep(0, maxLen - length(i))))
oid <- do.call(rbind, oid)
oid <- data.frame(oid)
return(oid)
}
# add oid columns for subsequent sorting
out <- data.frame(out, sortByOid(oid = out), check.names = FALSE)
# sort on _id, then name, then all oid columns (since number of
# columns in oid varies, have to exclude columns unused for sort)
oo <- with(out, do.call(order, out[, -match(c("value", "identifier"), names(out))]))
out <- out[oo, ]
# keep and sort columns
out <- out[c("_id", "identifier", "name", "value")]
# reset row numbers
row.names(out) <- NULL
# inform
message(
"\nTotal ", nrow(out), " rows, ",
length(unique(out[["name"]])),
" unique names of variables")
# output
if (any("tibble" == .packages())) return(tibble::as_tibble(out))
return(out)
} # end dfTrials2Long