Skip to content

Commit a4d5c53

Browse files
Andreas BlätteAndreas Blätte
authored andcommitted
Merge branch 'dev'
2 parents 1127ea0 + 403f6d2 commit a4d5c53

File tree

17 files changed

+211
-125
lines changed

17 files changed

+211
-125
lines changed

DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
Package: RcppCWB
22
Type: Package
33
Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB')
4-
Version: 0.5.2
5-
Date: 2022-03-28
4+
Version: 0.5.3
5+
Date: 2022-05-17
66
Author: Andreas Blaette [aut, cre],
77
Bernard Desgraupes [aut],
88
Sylvain Loiseau [aut],

NEWS.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,22 @@
1+
# RcppCWB 0.5.2.9001ff
2+
3+
* Fixed a bug in the `region_matrix_corpus()` C++ code that would not show any
4+
context at all if s_attribute expansion transgressed start or end of corpus.
5+
* Fixed a bug in the `region_matrix_corpus()` C++ code that would result from
6+
not considering that query matches may go cover more than one strucs of a
7+
structural attribute.
8+
* `corpus_info_file()` does not crash if INFO is not defined in the registry
9+
file (#62).
10+
* Implicit processing of arguments `sAttribute` and `pAttribute` as `s_attribute`
11+
or `p_attribute` respectively is now accompanied by a warning that arguments
12+
are deprectated.
13+
* The `check_corpus()` function distinguishes between whether a corpus is loaded
14+
in the CL and/or CQP context.
15+
* `cwb_huffcode()` and `cwb_compress_rdx()` have argument `delete` to trigger
16+
deleting redundant files after compression (#60).
17+
* `cqp_load_corpus` will internally upper corpus ID as required in the CQP
18+
context (#64).
19+
120
# RcppCWB 0.5.2
221

322
* The example for `corpus_data_dir()` dir not work as intended without

R/checks.R

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,37 +32,42 @@ check_registry <- function(registry){
3232
#' @rdname checks
3333
#' @export check_corpus
3434
#' @importFrom fs path
35-
check_corpus <- function(corpus, registry){
35+
#' @param cl A `logical` value, whether CL availability of corpus is required
36+
#' for positive result.
37+
#' @param cqp A `logical` value, whether CQP availability of corpus is required
38+
#' for positive result.
39+
check_corpus <- function(corpus, registry, cl = TRUE, cqp = TRUE){
3640

37-
registry <- fs::path(registry)
38-
39-
if (length(corpus) != 1L)
40-
stop("corpus needs to be a vector of length 1")
41-
42-
if (!is.character(corpus))
43-
stop("corpus needs to be a character vector")
41+
if (length(corpus) != 1L) stop("corpus needs to be a vector of length 1")
42+
if (!is.character(corpus)) stop("corpus needs to be a character vector")
4443

44+
registry <- fs::path(registry)
4545
if (isFALSE(dir.exists(registry)))
4646
stop(sprintf("Registry directory '%s' does not exist.", registry))
4747

48-
if (isFALSE(cqp_is_initialized())) cqp_initialize(registry = registry)
48+
if (cl){
49+
if (!tolower(corpus) %in% cl_list_corpora()){
50+
cl_load_corpus(corpus = corpus, registry = registry)
51+
}
52+
cl_availability <- corpus_is_loaded(corpus = corpus, registry = registry)
53+
}
4954

50-
if (!tolower(corpus) %in% cl_list_corpora()){
51-
52-
success_cl <- cl_load_corpus(corpus = corpus, registry = registry)
53-
if (isFALSE(success_cl))
54-
warning(sprintf("corpus '%s' is not loaded and cannot be loaded", corpus))
55-
56-
success_cqp <- cqp_load_corpus(corpus = toupper(corpus), registry = registry)
57-
if (isFALSE(success_cqp))
58-
warning(sprintf("corpus '%s' is not loaded and cannot be loaded", corpus))
59-
55+
if (cqp){
56+
if (isFALSE(cqp_is_initialized())) cqp_initialize(registry = registry)
57+
if (!toupper(corpus) %in% cqp_list_corpora()){
58+
cqp_load_corpus(corpus = toupper(corpus), registry = registry)
59+
}
60+
cqp_availability <- as.logical(
61+
.check_corpus(corpus = toupper(corpus))
62+
)
6063
}
61-
62-
if (.check_corpus(toupper(corpus)) == 0L)
63-
stop(sprintf("corpus %s is not available (check whether there is a typo)", sQuote(corpus)))
6464

65-
return( TRUE )
65+
if (!any(cl, cqp)) return(TRUE)
66+
if (all(cl, cqp)) return(all(cqp_availability, cl_availability))
67+
if (cl && !cqp) return(cl_availability)
68+
if (!cl && cqp) return(cqp_availability)
69+
70+
return(FALSE)
6671
}
6772

6873
#' @export check_s_attribute

R/cl.R

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ cl_lexicon_size <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_R
139139
#' fulltext <- paste(txt, collapse = " ")
140140
cl_cpos2struc <- function(corpus, s_attribute, cpos, registry = Sys.getenv("CORPUS_REGISTRY")){
141141
check_registry(registry)
142-
check_corpus(corpus, registry)
142+
check_corpus(corpus, registry, cqp = FALSE)
143143
check_s_attribute(corpus = corpus, registry = registry, s_attribute = s_attribute)
144144

145145
if (length(cpos) == 0L) return(integer())
@@ -151,7 +151,7 @@ cl_cpos2struc <- function(corpus, s_attribute, cpos, registry = Sys.getenv("CORP
151151
#' @rdname s_attributes
152152
cl_struc2cpos <- function(corpus, s_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), struc){
153153
check_registry(registry)
154-
check_corpus(corpus, registry)
154+
check_corpus(corpus, registry, cqp = FALSE)
155155
check_s_attribute(corpus = corpus, registry = registry, s_attribute = s_attribute)
156156
check_strucs(corpus = corpus, s_attribute = s_attribute, strucs = struc, registry = registry)
157157
.cl_struc2cpos(corpus = corpus, s_attribute = s_attribute, registry = registry, struc = struc)
@@ -160,7 +160,7 @@ cl_struc2cpos <- function(corpus, s_attribute, registry = Sys.getenv("CORPUS_REG
160160
#' @rdname s_attributes
161161
cl_struc2str <- function(corpus, s_attribute, struc, registry = Sys.getenv("CORPUS_REGISTRY")){
162162
check_registry(registry)
163-
check_corpus(corpus, registry)
163+
check_corpus(corpus, registry, cqp = FALSE)
164164
check_s_attribute(corpus = corpus, registry = registry, s_attribute = s_attribute)
165165
check_strucs(corpus = corpus, s_attribute = s_attribute, strucs = struc, registry = registry)
166166
.cl_struc2str(corpus = corpus, s_attribute = s_attribute, struc = struc, registry = registry)
@@ -169,7 +169,7 @@ cl_struc2str <- function(corpus, s_attribute, struc, registry = Sys.getenv("CORP
169169
#' @rdname s_attributes
170170
cl_cpos2lbound <- function(corpus, s_attribute, cpos, registry = Sys.getenv("CORPUS_REGISTRY")){
171171
check_registry(registry)
172-
check_corpus(corpus, registry)
172+
check_corpus(corpus, registry, cqp = FALSE)
173173
check_s_attribute(corpus = corpus, registry = registry, s_attribute = s_attribute)
174174

175175
if (length(cpos) == 0L) return(integer())
@@ -181,7 +181,7 @@ cl_cpos2lbound <- function(corpus, s_attribute, cpos, registry = Sys.getenv("COR
181181
#' @rdname s_attributes
182182
cl_cpos2rbound <- function(corpus, s_attribute, cpos, registry = Sys.getenv("CORPUS_REGISTRY")){
183183
check_registry(registry)
184-
check_corpus(corpus, registry)
184+
check_corpus(corpus, registry, cqp = FALSE)
185185
check_s_attribute(corpus = corpus, registry = registry, s_attribute = s_attribute)
186186

187187
check_cpos(corpus = corpus, p_attribute = "word", cpos = cpos, registry = registry)
@@ -267,45 +267,45 @@ cl_cpos2rbound <- function(corpus, s_attribute, cpos, registry = Sys.getenv("COR
267267
#'
268268
cl_cpos2str <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), cpos){
269269
check_registry(registry)
270-
check_corpus(corpus, registry)
270+
check_corpus(corpus, registry, cqp = FALSE)
271271
if (length(cpos) == 0L) return(integer())
272272
.cl_cpos2str(corpus = corpus, p_attribute = p_attribute, registry = registry, cpos = cpos)
273273
}
274274

275275
#' @rdname p_attributes
276276
cl_cpos2id <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), cpos){
277277
check_registry(registry)
278-
check_corpus(corpus, registry)
278+
check_corpus(corpus, registry, cqp = FALSE)
279279
if (length(cpos) == 0L) return(integer())
280280
.cl_cpos2id(corpus = corpus, p_attribute = p_attribute, registry = registry, cpos = cpos)
281281
}
282282

283283
#' @rdname p_attributes
284284
cl_id2str <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), id){
285285
check_registry(registry)
286-
check_corpus(corpus, registry)
286+
check_corpus(corpus, registry, cqp = FALSE)
287287
check_id(corpus = corpus, p_attribute = p_attribute, id = id, registry = registry)
288288
.cl_id2str(corpus = corpus, p_attribute = p_attribute, registry = registry, id = id)
289289
}
290290

291291
#' @rdname p_attributes
292292
cl_regex2id <- function(corpus, p_attribute, regex, registry = Sys.getenv("CORPUS_REGISTRY")){
293293
check_registry(registry)
294-
check_corpus(corpus, registry)
294+
check_corpus(corpus, registry, cqp = FALSE)
295295
.cl_regex2id(corpus = corpus, p_attribute = p_attribute, regex = regex, registry = registry)
296296
}
297297

298298
#' @rdname p_attributes
299299
cl_str2id <- function(corpus, p_attribute, str, registry = Sys.getenv("CORPUS_REGISTRY")){
300300
check_registry(registry)
301-
check_corpus(corpus, registry)
301+
check_corpus(corpus, registry, cqp = FALSE)
302302
.cl_str2id(corpus = corpus, p_attribute = p_attribute, str = str, registry = registry)
303303
}
304304

305305
#' @rdname p_attributes
306306
cl_id2freq <- function(corpus, p_attribute, id, registry = Sys.getenv("CORPUS_REGISTRY")){
307307
check_registry(registry)
308-
check_corpus(corpus, registry)
308+
check_corpus(corpus, registry, cqp = FALSE)
309309
check_p_attribute(p_attribute = p_attribute, corpus = corpus, registry = registry)
310310
check_id(corpus = corpus, p_attribute = p_attribute, id = id, registry = registry)
311311
.cl_id2freq(corpus = corpus, p_attribute = p_attribute, id = id, registry = registry)
@@ -315,7 +315,7 @@ cl_id2freq <- function(corpus, p_attribute, id, registry = Sys.getenv("CORPUS_RE
315315
#' @rdname p_attributes
316316
cl_id2cpos <- function(corpus, p_attribute, id, registry = Sys.getenv("CORPUS_REGISTRY")){
317317
check_registry(registry)
318-
check_corpus(corpus, registry)
318+
check_corpus(corpus, registry, cqp = FALSE)
319319
check_p_attribute(p_attribute = p_attribute, corpus = corpus, registry = registry)
320320
check_id(corpus = corpus, p_attribute = p_attribute, id = id, registry = registry)
321321
.cl_id2cpos(corpus = corpus, p_attribute = p_attribute, id = id, registry = registry)
@@ -394,7 +394,7 @@ cl_charset_name <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")){
394394
#' cl_struc_values("REUTERS", "places") # TRUE - attribute has values
395395
#' cl_struc_values("REUTERS", "date") # NA - attribute does not exist
396396
cl_struc_values <- function(corpus, s_attribute, registry = Sys.getenv("CORPUS_REGISTRY")){
397-
check_corpus(corpus = corpus, registry = registry)
397+
check_corpus(corpus = corpus, registry = registry, cqp = FALSE)
398398
registry <- normalizePath(path.expand(registry))
399399
i <- .cl_struc_values(corpus = corpus, s_attribute = s_attribute, registry = registry)
400400
if (i == 1L) TRUE else if (i == 0L) FALSE else if (i < 0L) as.integer(NA)
@@ -414,19 +414,20 @@ cl_struc_values <- function(corpus, s_attribute, registry = Sys.getenv("CORPUS_R
414414
#' @examples
415415
#' corpus_data_dir("REUTERS", registry = get_tmp_registry())
416416
corpus_data_dir <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")){
417-
check_corpus(corpus = corpus, registry = registry)
417+
check_corpus(corpus = corpus, registry = registry, cqp = FALSE)
418418
registry <- path(path_expand(registry))
419419
dir <- .corpus_data_dir(corpus = corpus, registry = registry)
420420
path(dir)
421421
}
422422

423423
#' @details `corpus_info_file()` will return the path to the info file for a
424-
#' corpus (class `fs_path` object).
424+
#' corpus (class `fs_path` object). If info file does not exist or INFO line
425+
#' is missing in the registry file, `NA` is returned.
425426
#' @rdname registry_info
426427
#' @examples
427428
#' corpus_info_file("REUTERS", registry = get_tmp_registry())
428429
corpus_info_file <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")){
429-
check_corpus(corpus = corpus, registry = registry)
430+
check_corpus(corpus = corpus, registry = registry, cqp = FALSE)
430431
registry <- path(path_expand(registry))
431432
fname <- .corpus_info_file(corpus = corpus, registry = registry)
432433
path(fname)
@@ -438,7 +439,7 @@ corpus_info_file <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")){
438439
#' @examples
439440
#' corpus_full_name("REUTERS", registry = get_tmp_registry())
440441
corpus_full_name <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")){
441-
check_corpus(corpus = corpus, registry = registry)
442+
check_corpus(corpus = corpus, registry = registry, cqp = FALSE)
442443
registry <- path(path_expand(registry))
443444
.corpus_full_name(corpus = corpus, registry = registry)
444445
}
@@ -449,7 +450,7 @@ corpus_full_name <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")){
449450
#' @examples
450451
#' corpus_p_attributes("REUTERS", registry = get_tmp_registry())
451452
corpus_p_attributes <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")){
452-
check_corpus(corpus = corpus, registry = registry)
453+
check_corpus(corpus = corpus, registry = registry, cqp = FALSE)
453454
registry <- path(path_expand(registry))
454455
.corpus_p_attributes(corpus = corpus, registry = registry)
455456
}
@@ -460,7 +461,7 @@ corpus_p_attributes <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")
460461
#' @examples
461462
#' corpus_s_attributes("REUTERS", registry = get_tmp_registry())
462463
corpus_s_attributes <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")){
463-
check_corpus(corpus = corpus, registry = registry)
464+
check_corpus(corpus = corpus, registry = registry, cqp = FALSE)
464465
registry <- path(path_expand(registry))
465466
.corpus_s_attributes(corpus = corpus, registry = registry)
466467
}
@@ -471,7 +472,7 @@ corpus_s_attributes <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")
471472
#' @examples
472473
#' corpus_properties("REUTERS", registry = get_tmp_registry())
473474
corpus_properties <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")){
474-
check_corpus(corpus = corpus, registry = registry)
475+
check_corpus(corpus = corpus, registry = registry, cqp = FALSE)
475476
registry <- path(path_expand(registry))
476477
.corpus_properties(corpus = corpus, registry = registry)
477478
}
@@ -492,7 +493,7 @@ corpus_property <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY"), pr
492493
length(property) == 1L,
493494
is.character(property)
494495
)
495-
check_corpus(corpus = corpus, registry = registry)
496+
check_corpus(corpus = corpus, registry = registry, cqp = FALSE)
496497
registry <- path(path_expand(registry))
497498
.corpus_property(corpus = corpus, registry = registry, property = property)
498499
}

R/cqp.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,5 +212,5 @@ matrix_to_subcorpus <- function(region_matrix, corpus, subcorpus){
212212
#' @export cqp_load_corpus
213213
#' @rdname cqp_initialize
214214
cqp_load_corpus <- function(corpus, registry){
215-
as.logical(.cqp_load_corpus(corpus = corpus, registry = registry))
215+
as.logical(.cqp_load_corpus(corpus = toupper(corpus), registry = registry))
216216
}

R/cwb.R

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,9 @@ cwb_makeall <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGIS
9393

9494
#' @rdname cwb_utils
9595
#' @export cwb_huffcode
96-
cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE){
96+
#' @param delete A `logical` value, whether to remove redundant files after
97+
#' compression.
98+
cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE, delete = TRUE){
9799
huffcode <- function()
98100
.cwb_huffcode(x = corpus, p_attribute = p_attribute, registry_dir = registry)
99101

@@ -102,6 +104,15 @@ cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI
102104
} else {
103105
success <- huffcode()
104106
}
107+
108+
if (delete){
109+
data_dir <- corpus_data_dir(corpus = corpus, registry = registry)
110+
fname <- path(data_dir, sprintf("%s.corpus", p_attribute))
111+
if (!file.exists(fname)) warning("cwb_huffcode: file to delete missing")
112+
removed <- file.remove(fname)
113+
if (removed) if (!quietly) message("redundant file deleted: ", fname)
114+
}
115+
105116
success
106117

107118
}
@@ -114,7 +125,7 @@ cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI
114125
#' p_attribute = "word",
115126
#' registry = get_tmp_registry()
116127
#' )
117-
cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE){
128+
cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE, delete = TRUE){
118129
compress_rdx <-function()
119130
.cwb_compress_rdx(x = corpus, p_attribute = p_attribute, registry_dir = registry)
120131

@@ -123,6 +134,21 @@ cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_
123134
} else {
124135
success <- compress_rdx()
125136
}
137+
138+
if (delete){
139+
data_dir <- corpus_data_dir(corpus = corpus, registry = registry)
140+
141+
rev_file <- path(data_dir, sprintf("%s.corpus.rev", p_attribute))
142+
if (!file.exists(rev_file)) warning("cwb_huffcode: file to delete missing")
143+
removed <- file.remove(rev_file)
144+
if (removed) if (!quietly) message("redundant file deleted: ", rev_file)
145+
146+
rdx_file <- path(data_dir, sprintf("%s.corpus.rdx", p_attribute))
147+
if (!file.exists(rdx_file)) warning("cwb_huffcode: file to delete missing")
148+
removed <- file.remove(rdx_file)
149+
if (removed) if (!quietly) message("redundant file deleted: ", rdx_file)
150+
}
151+
126152
success
127153
}
128154

cran-comments.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
## General remarks
22

3-
This is a "standard" release with some new functionality. No changes to the
3+
This is a maintenance release with bug fixes. No changes to the
44
setup of the package.
55

6-
Previous aspects I repeat here:
6+
Previous aspects I repeat:
77

88
- CRAN package check results report 'GNU make is a SystemRequirements'. Using GNU
99
make remains important for the portability of the C code. There would be a great
@@ -18,7 +18,8 @@ change.
1818
## Test environments
1919

2020
* CI checks with GitHub Actions (Windows/macOS/Ubuntu)
21-
* local macOS R 4.1.3 (arm64)
21+
* R winbuilder (R 4.2 release and devel)
22+
* local macOS, still R 4.1.3 (arm64)
2223

2324

2425
## R CMD check results

inst/extdata/cwb/registry/unga

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ NAME "UNGA"
88
ID unga
99
# path to binary data files
1010
HOME /private/var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T/RtmpMJzg3A/revdep1207b38ee9c65/RcppCWB/extdata/cwb/indexed_corpora/unga
11-
# optional info file (displayed by ",info;" command in CQP)
12-
INFO /private/var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T/RtmpMJzg3A/revdep1207b38ee9c65/RcppCWB/extdata/cwb/indexed_corpora/unga/.info
1311

1412
# corpus properties provide additional information about the corpus:
1513
##:: charset = "utf8"

0 commit comments

Comments
 (0)