@@ -4,13 +4,26 @@ conf <- BioDataScience::config()
4
4
library(shiny )
5
5
library(learndown )
6
6
library(BioDataScience2 )
7
+ library(dplyr )
8
+ library(tidyr )
9
+ library(flow )
10
+ library(chart )
7
11
8
12
# add news functions ----
9
13
# # This function move to a package
10
14
15
+ # CAH for SciViews, version 1.1.1
16
+ # Copyright (c) 2021, Philippe Grosjean (phgrsojean@sciviews.org)
17
+
18
+ # dist is really a dissimilarity matrix => we use dissimilarity() as in the
19
+ # {cluster} package, i.e., class is c("dissimilarity", "dist")
20
+ # TODO: also make a similarity object and convert between the two
21
+ # fun can be stats::dist, vegan::vegdist, vegan::designdist, cluster::daisy
22
+ # factoextra::get_dist and probably other dist-compatible functions
23
+ # Depending on method =, use either vegan::vegdist or stats::dist as default fun
11
24
dissimilarity <- function (data , formula = ~ . , subset = NULL ,
12
- method = " euclidean" , scale = FALSE , rownames.col = " rowname" ,
13
- transpose = FALSE , fun = NULL , ... ) {
25
+ method = " euclidean" , scale = FALSE , rownames.col = " rowname" ,
26
+ transpose = FALSE , fun = NULL , ... ) {
14
27
# TODO: get more meaningful warnings and errors by replacing fun by actual
15
28
# name of the function
16
29
if (is.null(fun )) {# Default function depends on the chosen method
@@ -104,7 +117,7 @@ as.dissimilarity.matrix <- function(x, ...) {
104
117
105
118
# We want to print only the first few rows and columns
106
119
print.dissimilarity <- function (x , digits.d = 3L , rownames.lab = " labels" ,
107
- ... ) {
120
+ ... ) {
108
121
mat <- as.matrix(x )
109
122
mat <- format(round(mat , digits.d ))
110
123
diag(mat ) <- " "
@@ -128,7 +141,7 @@ print.dissimilarity <- function(x, digits.d = 3L, rownames.lab = "labels",
128
141
more_info <- " (transposed data)"
129
142
}
130
143
cat(" Dissimilarity matrix with metric: " , attr(x , " metric" ),
131
- more_info , " \n " , sep = " " )
144
+ more_info , " \n " , sep = " " )
132
145
print(tbl )
133
146
invisible (x )
134
147
}
@@ -144,14 +157,14 @@ nobs.dissimilarity <- function(object, ...)
144
157
# TODO: `[` by first transforming into a matrix with as.matrix()
145
158
146
159
autoplot.dissimilarity <- function (object , order = TRUE , show.labels = TRUE ,
147
- lab.size = NULL , gradient = list (low = " red" , mid = " white" , high = " blue" ),
148
- ... ) {
160
+ lab.size = NULL , gradient = list (low = " red" , mid = " white" , high = " blue" ),
161
+ ... ) {
149
162
factoextra :: fviz_dist(object , order = order , show_labels = show.labels ,
150
- lab_size = lab.size , gradient = gradient )
163
+ lab_size = lab.size , gradient = gradient )
151
164
}
152
165
153
166
chart.dissimilarity <- function (data , ... ,
154
- type = NULL , env = parent.frame())
167
+ type = NULL , env = parent.frame())
155
168
autoplot(data , type = type , ... )
156
169
157
170
# cluster object (inheriting from hclust)
@@ -234,7 +247,7 @@ augment.cluster <- function(x, data, k = NULL, h = NULL, ...) {
234
247
clst <- predict(x , k = k , h = h , ... )
235
248
if (nrow(data ) != length(clst )) {
236
249
stop(" Different number of items in " , msg , " (" ,nrow(data ) ,
237
- " ) and in the clusters (" , length(clst ), " )" )
250
+ " ) and in the clusters (" , length(clst ), " )" )
238
251
}
239
252
tibble :: add_column(data , .fitted = clst )
240
253
}
@@ -245,7 +258,7 @@ augment.cluster <- function(x, data, k = NULL, h = NULL, ...) {
245
258
# /beautiful-dendrogram-visualizations-in-r-5-must-known-methods
246
259
# -unsupervised-machine-learning
247
260
plot.cluster <- function (x , y , hang = - 1 , check = TRUE , type = " vertical" ,
248
- lab = " Height" , ... ) {
261
+ lab = " Height" , ... ) {
249
262
type <- match.arg(type [1 ], c(" vertical" , " horizontal" , " circular" ))
250
263
# type == "circular" is special because we need to transform as ape::phylo
251
264
if (type == " circular" ) {
@@ -269,11 +282,11 @@ plot.cluster <- function(x, y, hang = -1, check = TRUE, type = "vertical",
269
282
# TODO: should be nice to do similar function for other symbols too in SciViews
270
283
circle <- function (x = 0 , y = 0 , d = 1 , col = 0 , lwd = 1 , lty = 1 , ... )
271
284
symbols(x = x , y = y , circles = d / 2 , fg = col , lwd = lwd , lty = lty ,
272
- inches = FALSE , add = TRUE , ... )
285
+ inches = FALSE , add = TRUE , ... )
273
286
274
287
# TODO: make sure the dendrogram is correct with different ggplot themes
275
288
autoplot.cluster <- function (object , type = " vertical" , circ.text.size = 3 ,
276
- theme = theme_sciviews(), xlab = " " , ylab = " Height" , ... ) {
289
+ theme = theme_sciviews(), xlab = " " , ylab = " Height" , ... ) {
277
290
if (is.null(type ))
278
291
type <- " vertical"
279
292
type <- match.arg(type [1 ], c(" vertical" , " horizontal" , " circular" ))
@@ -301,49 +314,55 @@ autoplot.cluster <- function(object, type = "vertical", circ.text.size = 3,
301
314
scale_y_reverse() +
302
315
coord_polar(start = pi / 2 ) +
303
316
geom_text(data = label_df ,
304
- aes(x = id , y = - 0.02 , label = labels , hjust = hjust ),
305
- size = circ.text.size , angle = label_df $ angle , inherit.aes = FALSE ) +
317
+ aes(x = id , y = - 0.02 , label = labels , hjust = hjust ),
318
+ size = circ.text.size , angle = label_df $ angle , inherit.aes = FALSE ) +
306
319
theme(panel.border = element_blank(),
307
- axis.text = element_blank(),
308
- axis.line = element_blank(),
309
- axis.ticks.y = element_blank()) +
320
+ axis.text = element_blank(),
321
+ axis.line = element_blank(),
322
+ axis.ticks.y = element_blank()) +
310
323
ylab(" " )
311
324
312
325
} else if (type == " vertical" ) {# Vertical dendrogram
313
326
dendro <- dendro +
314
327
scale_x_continuous(breaks = seq_along(ddata $ labels $ label ),
315
- labels = ddata $ labels $ label ) +
328
+ labels = ddata $ labels $ label ) +
316
329
scale_y_continuous(expand = expansion(mult = c(0 , 0.02 ))) +
317
330
theme(panel.border = element_blank(),
318
- axis.text.x = element_text(angle = 90 , hjust = 1 , vjust = 0.5 ),
319
- axis.line.x = element_blank(),
320
- axis.ticks.x = element_blank(),
321
- axis.text.y = element_text(angle = 90 , hjust = 0.5 ))
331
+ axis.text.x = element_text(angle = 90 , hjust = 1 , vjust = 0.5 ),
332
+ axis.line.x = element_blank(),
333
+ axis.ticks.x = element_blank(),
334
+ axis.text.y = element_text(angle = 90 , hjust = 0.5 ))
322
335
323
336
} else {# Horizontal dendrogram
324
337
dendro <- dendro +
325
338
scale_x_continuous(breaks = seq_along(ddata $ labels $ label ),
326
- labels = ddata $ labels $ label , position = " top" ) +
339
+ labels = ddata $ labels $ label , position = " top" ) +
327
340
scale_y_reverse(expand = expansion(mult = c(0.05 , 0 ))) +
328
341
coord_flip() +
329
342
theme(panel.border = element_blank(),
330
- axis.line.y = element_blank(),
331
- axis.ticks.y = element_blank())
343
+ axis.line.y = element_blank(),
344
+ axis.ticks.y = element_blank())
332
345
}
333
346
dendro
334
347
}
335
348
336
349
chart.cluster <- function (data , ... ,
337
- type = NULL , env = parent.frame())
350
+ type = NULL , env = parent.frame())
338
351
autoplot(data , type = type , ... )
339
352
353
+ # To indicate where to cut in the dendrogram, one could use `geom_hline()`,
354
+ # but when the dendrogram is horizontal or circular, this is suprizing. So,
355
+ # I define geom_dendroline(h = ....)
356
+ geom_dendroline <- function (h , ... )
357
+ geom_hline(yintercept = h , ... )
340
358
341
359
# data ----
342
- penguins <- read(" penguins" , package = " palmerpenguins" )
360
+ penguins <- data.io :: read(" penguins" , package = " palmerpenguins" )
343
361
344
362
penguins %> . %
345
363
# filter(., sex == "male") %>.%
346
- select(. , species , bill_length_mm , bill_depth_mm , flipper_length_mm , body_mass_g ) %> . %
364
+ select(. , species , bill_length_mm , bill_depth_mm , flipper_length_mm ,
365
+ body_mass_g ) %> . %
347
366
drop_na(. ) - > peng
348
367
349
368
peng %> . %
@@ -356,10 +375,10 @@ score_cah <- function(x, reference = peng$species, digits = 5) {
356
375
max_gr <- apply(tab , 1 , which.max )
357
376
tab [ , ]
358
377
359
- if (length(unique(max_gr )) < 3 )
378
+ if (length(unique(max_gr )) < 3 )
360
379
res <- " Votre CAH ne permet pas de retrouver les 3 groupes. Un ou plusieurs groupes sont confondus."
361
380
362
- if (length(unique(max_gr )) == 3 ) {
381
+ if (length(unique(max_gr )) == 3 ) {
363
382
tot <- apply(tab , 1 , max ) / rowSums(tab )
364
383
res <- paste0(" Votre CAH permet de discerner 3 groupes avec une précision de " , round((100 * sum(tot )/ nlevels(reference )),digits = digits ), " %." )
365
384
}
@@ -369,15 +388,16 @@ score_cah <- function(x, reference = peng$species, digits = 5) {
369
388
# UI -----
370
389
371
390
ui <- fluidPage(
372
- learndownShiny(" Regroupement d'espèces de manchôts avec la classification hiérarchique ascendante ." ),
391
+ learndownShiny(" Classification hiérarchique ascendante sur des mesures de manchots d'antarctique ." ),
373
392
374
393
sidebarLayout(
375
394
sidebarPanel(
376
- p(" Vous avez à disposition 342 manchôts de 3 espèces différentes. Trouvez les meilleurs paramètres afin d'obtenir la plus haute similitude entre votre CAH et les observations de terrain ." ),
377
- p(" Les variables monitorées sont les suivante : la longueur du bec (mm), la profondeur du bec (mm), la longueur de la nageoire (mm), la masse (g)." ),
378
- selectInput(" method_dist" , " Indice de distance" , choices = c(" euclidian" , " bray" , " canberra" , " manhattan" )),
395
+ p(" Vous avez à disposition des mesures sur 342 manchots de 3 espèces différentes. Trouvez les meilleurs paramètres pour votre CAH afin d'optimiser votre regroupement ." ),
396
+ p(" Les variables mesurées sont les suivantes : la longueur du bec (mm), la largeur du bec (mm), la longueur de la nageoire (mm) et la masse (g)." ),
397
+ selectInput(" method_dist" , " Métrique de distance" , choices = c(" euclidian" , " bray" , " canberra" , " manhattan" )),
379
398
selectInput(" scale" , " Standardisation" , choices = c(FALSE , TRUE )),
380
- selectInput(" method_clust" , " Méthode de CAH" , choices = c(" complete" , " single" ," average" , " ward.D2" )),
399
+ selectInput(" method_clust" , " Méthode de CAH" ,
400
+ choices = c(" complete" , " single" , " average" , " ward.D2" )),
381
401
hr(),
382
402
submitQuitButtons()
383
403
),
@@ -399,11 +419,11 @@ ui <- fluidPage(
399
419
)
400
420
)
401
421
402
-
403
422
server <- function (input , output , session ) {
404
423
405
424
cah <- reactive({
406
- peng_dist <- dissimilarity(data = peng_red , scale = as.logical(input $ scale ), method = input $ method_dist )
425
+ peng_dist <- dissimilarity(data = peng_red , scale = as.logical(input $ scale ),
426
+ method = input $ method_dist )
407
427
peng_clust <- cluster(peng_dist , method = input $ method_clust )
408
428
peng_clust
409
429
})
@@ -429,14 +449,14 @@ server <- function(input, output, session) {
429
449
})
430
450
431
451
432
- trackEvents(session , input , output ,
433
- sign_in.fun = BioDataScience :: sign_in , config = conf )
434
- trackSubmit(session , input , output , max_score = 3 , solution =
435
- list (method_dist = " euclidian" , scale = " TRUE" , method_clust = " ward.D2" ),
436
- comment = " " ,
437
- message.success = " Correct, c'est la meilleur solution. La CAH obtient un score très bon de plus de 94 % de correspondace " ,
438
- message.error = " Incorrect, un meilleur choix des paramètres est possible." )
439
- trackQuit(session , input , output , delay = 20 )
452
+ trackEvents(session , input , output ,
453
+ sign_in.fun = BioDataScience :: sign_in , config = conf )
454
+ trackSubmit(session , input , output , max_score = 3 , solution =
455
+ list (method_dist = " euclidian" , scale = " TRUE" , method_clust = " ward.D2" ),
456
+ comment = " " ,
457
+ message.success = " Correct, c'est la meilleur solution. La CAH obtient un score très bon de plus de 94 % de correspondance " ,
458
+ message.error = " Incorrect, un meilleur choix des paramètres est possible." )
459
+ trackQuit(session , input , output , delay = 20 )
440
460
}
441
461
442
462
shinyApp(ui , server )
0 commit comments