-
Notifications
You must be signed in to change notification settings - Fork 62
/
Copy pathncbi_children.R
193 lines (185 loc) · 8.08 KB
/
ncbi_children.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#' @title Search NCBI for children of a taxon
#'
#' @description Search the NCBI Taxonomy database for uids of children of taxa.
#' Taxa can be referenced by name or uid. Referencing by name is faster
#'
#' In a few cases, different taxa have the same name (e.g. Satyrium; see
#' examples). If one of these are searched for then the children of both taxa
#' will be returned. This can be avoided by using a uid instead of the name or
#' specifying an ancestor. If an ancestor is provided, only children of both
#' the taxon and its ancestor are returned. This will only fail if there are
#' two taxa with the same name and the same specified ancestor.
#'
#' @export
#' @param name (`character`) The string to search for. Only exact matches
#' found the name given will be returned. Not compatible with `id`.
#' @param id (`character`/`numeric`) The uid to search for. Not compatible with
#' `name`.
#' @param start The first record to return. If omitted, the results are
#' returned from the first record (start=0).
#' @param max_return (`numeric; length=1`) The maximum number of children
#' to return.
#' @param ancestor (`character`) The ancestor of the taxon being searched
#' for. This is useful if there could be more than one taxon with the same
#' name. Has no effect if `id` is used.
#' @param out_type (character) Currently either `"summary"` or `"uid"`:
#' * `summary` The output is a list of `data.frame` with children uid, name,
#' and rank.
#' * `uid` A list of character vectors of children uids
#' @param ambiguous `logical; length 1` If `FALSE`, children taxa
#' with words like "unclassified", "unknown", "uncultured", or "sp." are
#' removed from the output. NOTE: This option only applies when
#' `out_type= "summary"`.
#' @param key (character) NCBI Entrez API key. optional. See Details.
#' @param ... Curl options passed on to [crul::HttpClient]
#' @return The output type depends on the value of the `out_type`
#' parameter. Taxa that cannot be found will result in `NA`s and a lack
#' of children results in an empty data structure.
#' @seealso [ncbi_get_taxon_summary()], [children()]
#' @author Zachary Foster \email{zacharyfoster1989@@gmail.com}
#'
#' @section Authentication:
#' See [taxize-authentication()] for help on authentication.
#' We strongly recommend getting an API key
#'
#' @section HTTP version:
#' We hard code `http_version = 2L` to use HTTP/1.1 in HTTP requests to
#' the Entrez API. See `curl::curl_symbols('CURL_HTTP_VERSION')`
#'
#' @section Rate limits:
#' In case you run into errors due to your rate limit being exceeded, see
#' [taxize_options()], where you can set `ncbi_sleep`.
#'
#' @examples \dontrun{
#' ncbi_children(name="Satyrium") #Satyrium is the name of two different genera
#' ncbi_children(name="Satyrium", ancestor="Eumaeini") # A genus of butterflies
#' ncbi_children(name="Satyrium", ancestor="Orchidaceae") # A genus of orchids
#' ncbi_children(id="266948") #"266948" is the uid for the butterfly genus
#' ncbi_children(id="62858") #"62858" is the uid for the orchid genus
#'
#' # use curl options
#' ncbi_children(name="Satyrium", ancestor="Eumaeini", verbose = TRUE)
#' }
ncbi_children <- function(name = NULL, id = NULL, start = 0, max_return = 1000,
ancestor = NULL, out_type = c("summary", "uid"),
ambiguous = FALSE, key = NULL, ...) {
assert(name, "character")
assert(id, c("character", "numeric", "uid"))
key <- getkey(key, "ENTREZ_KEY")
# Constants -----------------------------------------------------------------
ambiguous_regex <- paste(
sep = "|", "unclassified", "environmental", "uncultured", "unknown",
"unidentified", "candidate", "\\ssp\\.", "s\\.l\\.", "sensu lato", "clone",
"miscellaneous", "candidatus", "affinis", "aff\\.", "incertae sedis",
"mixed", "samples", "libaries")
# Argument validation -------------------------------------------------------
if (sum(c(is.null(name), is.null(id))) != 1) {
stop("Either name or id must be specified, but not both")
}
out_type <- match.arg(out_type)
# Get name from id ----------------------------------------------------------
if (is.null(name)) {
if (!inherits(id, 'uid')) attr(id, 'class') <- 'uid'
id_taxonomy <- classification(id, db = 'ncbi')
id_taxonomy <- lapply(id_taxonomy, function(z) {
if (!(inherits(z, "data.frame"))) data.frame(NULL) else z
})
name <- vapply(id_taxonomy,
function(x) ifelse(nrow(x) > 0, x$name[nrow(x)],
as.character(NA)), character(1))
ancestor <- vapply(id_taxonomy,
function(x) ifelse(nrow(x) > 1, x$name[nrow(x) - 1],
as.character(NA)), character(1))
# set id to NULL because we're using name now
id <- NULL
} else if (is.null(ancestor)) {
ancestor <- rep(NA, length(name))
}
# Function to search for queries one at a time ------------------------------
single_search <- function(name, ancestor, ...) {
if (is.na(name)) return(NA)
# Make eutils esearch query - - - - - - - - - - - - - - - - - - - - - - -
if (is.na(ancestor)) {
ancestor_query <- NULL
} else {
ancestor_query <- paste0("+AND+", ancestor, "[subtree]")
}
if (is.null(id)) {
args <- tc(list(
db = 'taxonomy',
term = paste0(name, "[Next+Level]", ancestor_query),
RetMax = max_return,
RetStart = start,
api_key = key
))
args$term <- gsub("\\+", " ", args$term)
# Search ncbi for children - - - - - - - - - - - - - - - - - - - - - - -
cli <- crul::HttpClient$new(ncbi_base(), headers = tx_ual,
opts = list(http_version = 2L, ...))
rr <- cli$get('entrez/eutils/esearch.fcgi', query = args)
rr$raise_for_status()
raw_results <- rr$parse("UTF-8")
# Parse results - - - - - - - - - - - - - - - - - - - - - - - - - - - -
results <- xml2::read_xml(raw_results)
children_uid <- xml_text_all(results, "//eSearchResult/IdList/Id")
} else {
args <- tc(list(
dbfrom = 'taxonomy',
db = 'taxonomy',
id = id,
term = paste0(name, "[Next+Level]"),
RetMax = max_return,
RetStart = start,
api_key = key
))
args$term <- gsub("\\+", " ", args$term)
# Search ncbi for children - - - - - - - - - - - - - - - - - - - - - - -
cli <- crul::HttpClient$new(ncbi_base(), headers = tx_ual,
opts = list(http_version = 2L, ...))
rr <- cli$get('entrez/eutils/elink.fcgi', query = args)
rr$raise_for_status()
raw_results <- rr$parse("UTF-8")
# Parse results - - - - - - - - - - - - - - - - - - - - - - - - - - - -
results <- xml2::read_xml(raw_results)
children_uid <- xml_text_all(results,
"//eLinkResult/LinkSet/LinkSetDb/Link/Id")
}
if (length(children_uid) == 0) {
if (out_type == "summary") {
output <- data.frame(
childtaxa_id = character(),
childtaxa_name = character(),
childtaxa_rank = character(),
stringsAsFactors = FALSE
)
} else {
output <- numeric()
}
} else {
if (out_type == "summary") {
output <- ncbi_get_taxon_summary(children_uid, key = key, ...)
names(output) <- c("childtaxa_id", "childtaxa_name", "childtaxa_rank")
# Remove ambiguous results - - - - - - - - - - - - - - -
if (!ambiguous) {
output <- output[!grepl(ambiguous_regex, output$childtaxa_name,
ignore.case = TRUE), ]
}
} else {
output <- children_uid
}
# numeric row names can be misleading after filtering
rownames(output) <- NULL
}
# NCBI limits requests to three per second
ncbi_rate_limit_pause(key)
return(output)
}
# Combine the result of multiple searches -----------------------------------
# output <- Map(single_search, name, ancestor)
output <- list()
for (i in seq_along(name)) {
output[[i]] <- single_search(name[[i]], ancestor[[i]], ...)
}
if (is.null(id)) names(output) <- name else names(output) <- id
return(output)
}