-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathdbFindFields.R
242 lines (204 loc) · 7.33 KB
/
dbFindFields.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
### ctrdata package
#' Find names of fields in the database collection
#'
#' Given part of the name of a field of interest to the user, this
#' function returns the full field names used in records that were
#' previously loaded into a collection
#' (using \link{ctrLoadQueryIntoDb}). Only names of fields that have
#' a value in the collection can be returned.
#' Set \code{sample = FALSE} to force screening all records in the
#' collection for field names, see below.
#' See \link{ctrShowOneTrial} to interactively find fields.
#'
#' The full names of child fields are returned in dot notation (e.g.,
#' \code{clinical_results.outcome_list.outcome.measure.class_list.class.title})
#' In addition, names of parent fields (e.g.,
#' \code{clinical_results}) are returned.
#' Data in parent fields is typically complex (nested), see
#' \link{dfTrials2Long} for easily handling it.
#' For field definitions of the registers, see
#' "Definition" in \link{ctrdata-registers}.
#' Note: When \code{dbFindFields} is first called after
#' \link{ctrLoadQueryIntoDb}, it will take a moment.
#'
#' @param namepart A character string (can be a regular expression,
#' including Perl-style) to be searched among all field names (keys)
#' in the collection, case-insensitive. The default `".*"` lists all fields.
#'
#' @param sample If \code{TRUE} (default), uses a sample of only 5 trial
#' records per register to identify fields, to rapidly return a possibly
#' incomplete set of field names.
#' If \code{FALSE}, uses all trial records in the collection, which will
#' take more time with more trials but ensures to returns all names of
#' all fields in the collection.
#'
#' @param verbose If \code{TRUE}, prints additional information
#' (default \code{FALSE}).
#'
#' @importFrom nodbi docdb_query
#'
#' @inheritParams ctrDb
#'
#' @return Vector of strings with full names of field(s) found,
#' ordered by register and alphabet, see examples. Names of the vector
#' are the names of the register holding the respective fields. The field
#' names can be fed into \link{dbGetFieldsIntoDf} to extract the
#' data for the field(s) from the collection into a data frame.
#'
#' @export
#'
#' @examples
#'
#' dbc <- nodbi::src_sqlite(
#' dbname = system.file("extdata", "demo.sqlite", package = "ctrdata"),
#' collection = "my_trials",
#' RSQLite::SQLITE_RO)
#'
#' dbFindFields(namepart = "date", con = dbc)[1:5]
#'
#' # view all 1880+ fields from all registers:
#'
#' allFields <- dbFindFields(con = dbc, sample = FALSE)
#'
#' if (interactive()) View(data.frame(
#' register = names(allFields),
#' field = allFields))
#'
dbFindFields <- function(namepart = ".*",
con,
sample = TRUE,
verbose = FALSE) {
## sanity checks
if (!is.atomic(namepart)) stop("'namepart' should be atomic.", call. = FALSE)
if (length(namepart) > 1) stop("'namepart' should have one element.", call. = FALSE)
if (namepart == "") stop("Empty 'namepart' parameter.", call. = FALSE)
## check database connection
if (is.null(con$ctrDb)) con <- ctrDb(con = con)
## check if cache environment has entry for the database
keyslist <- ctrCache(
xname = paste0("keyslist_", con$db, "/", con$collection, "_timestamp")
)
## get sample reference value
cacheSmpl <- ctrCache(
xname = paste0("keyslist_", con$db, "/", con$collection, "_sample")
)
## get cache reference value
cacheRef <- as.character(rev(unlist(try(nodbi::docdb_query(
src = con, key = con$collection, query = '{"_id": "meta-info"}',
fields = '{"queries.query-timestamp": 1, "_id": 0}'
), silent = TRUE)))[1])
## invalidate cache
cacheOutdated <- is.null(keyslist) || (cacheRef != keyslist) || (!sample & cacheSmpl)
## get keyslist
if (cacheOutdated) {
# inform user
message("Finding fields in database collection", appendLF = FALSE)
if (sample) {
n <- 5L
message(" (sampling ", n, " trial records per register) ", appendLF = FALSE)
} else {
message(" (may take some time) ", appendLF = FALSE)
n <- -1L
}
# construct queries
queries <- sapply(
registerList,
function(r) paste0('{"ctrname": "', r, '"}'),
simplify = TRUE,
USE.NAMES = TRUE)
# adding queries for records with results data, sequence important
queries <- c(
queries,
"EUCTR" = '{"trialInformation.fullTitle": {"$regex": ".+"}}',
"CTGOV" = '{"clinical_results.outcome_list.outcome.description": {"$regex": ".+"}}',
"CTGOV2" = '{"resultsSection.outcomeMeasuresModule.outcomeMeasures.type": {"$regex": ".+"}}')
# get names
keyslist <- list()
# iterate over registers
for (r in registerList) {
# iterate over queries for register
qs <- queries[names(queries) == r]
# when not sampling, get all records
if (!sample) qs <- qs[1]
# run queries from back
for (q in rev(qs)) {
message(". ", appendLF = FALSE)
keysAdd <- nodbi::docdb_query(
src = con,
key = con$collection,
query = q,
listfields = TRUE,
limit = n
)
# check if no need to run further queries
if (!is.null(keysAdd) &&
length(keysAdd) > 0L) break
} # for q in qs
# accumulate keys
keyslist[r] <- list(keysAdd[keysAdd != "_id" & keysAdd != ""])
} # for r in registerList
message()
## store keyslist to environment (cache)
if (length(keyslist) > 0L) {
ctrCache(
xname = paste0("keyslist_", con$db, "/", con$collection),
xvalue = keyslist
)
ctrCache(
xname = paste0("keyslist_", con$db, "/", con$collection, "_timestamp"),
xvalue = cacheRef
)
ctrCache(
xname = paste0("keyslist_", con$db, "/", con$collection, "_sample"),
xvalue = sample
)
message("Field names cached for this session.")
}
} else {
message("Using cache of fields. ")
if (cacheSmpl) message(
"Sample used, specify 'sample = FALSE' to digest all records.", "")
keyslist <- ctrCache(
xname = paste0("keyslist_", con$db, "/", con$collection),
verbose = verbose
)
} # generate keyslist
## inform user of unexpected situation
if (length(keyslist) == 0L) {
warning("No keys could be extracted, please check database ",
"and collection: ", con$db, "/", con$collection,
call. = FALSE
)
}
# process keys
# remove duplicates within registers
keyslist <- lapply(keyslist, unique)
# remove duplicates within registers
keyslist <- lapply(keyslist, sort)
# mangle list into named vector
lenKeyslist <- sapply(keyslist, length)
keyslist <- unlist(keyslist)
names(keyslist) <- unlist(sapply(
seq_along(lenKeyslist),
function(i) rep(
names(lenKeyslist)[i],
times = lenKeyslist[i])
), use.names = FALSE)
# now do the actual search and find for key name parts
keyslist <- keyslist[grepl(
pattern = namepart, x = keyslist,
ignore.case = TRUE, perl = TRUE
)]
# user info
if (verbose) {
fieldsTbl <- table(names(keyslist))
message(
"Found ", length(keyslist), " fields in ",
paste0(names(fieldsTbl), collapse = " / "), ": ",
paste0(fieldsTbl, collapse = " / "))
}
# return value if no fields found
if (!length(keyslist)) keyslist <- ""
# return the match(es)
return(keyslist)
} # end dbFindFields