forked from nf-core/differentialabundance
-
Notifications
You must be signed in to change notification settings - Fork 0
/
differentialabundance_report.Rmd
1110 lines (914 loc) · 43.8 KB
/
differentialabundance_report.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
output:
html_document:
toc: true # table of contents
toc_float: true # float the table of contents to the left of the main document content
toc_depth: 4 # header levels 1,2,3
theme: default
number_sections: false # add section numbering to headers
df_print: paged # tables are printed as an html table with support for pagination over rows and columns
highlight: pygments
pdf_document: true
pdf_document:
toc: yes
date: "`r Sys.Date()`"
params:
meta: NULL
input_dir: NULL
artifact_dir: NULL
cpus: 1
study_type: NULL
study_name: NULL
study_abundance_type: NULL
report_file: NULL,
report_title: NULL,
report_contributors: NULL
report_author: NULL,
report_description: NULL,
report_scree: NULL
report_round_digits: NULL
observations_type: NULL
observations: NULL # GSE156533.samplesheet.csv
observations_id_col: NULL
observations_name_col: NULL
features: NULL
features_type: NULL
features_id_col: NULL
features_name_col: NULL
features_metadata_cols: NULL
features_gtf_feature_type: NULL
features_gtf_table_first_field: NULL
exploratory_log2_assays: NULL
raw_matrix: null # e.g. 0_salmon.merged.gene_counts_length_scaled.tsv
normalised_matrix: null
variance_stabilised_matrix: null # e.g. test_files/3_treatment-WT-P23H.vst.tsv
contrasts_file: null # e.g. GSE156533.contrasts.csv
differential_table: file.csv
proteus_measurecol_prefix: NULL
proteus_norm_function: NULL
proteus_plotsd_method: NULL
proteus_plotmv_loess: NULL
proteus_palette_name: NULL
affy_cel_files_archive: NULL
affy_file_name_col: NULL
affy_background: NULL
affy_bgversion: NULL
affy_destructive: NULL
affy_cdfname: NULL
affy_rm_mask: NULL
affy_rm_outliers: NULL
affy_rm_extra: NULL
affy_build_annotation: NULL
limma_ndups: NULL
limma_spacing: NULL
limma_block: NULL
limma_correlation: NULL
limma_method: NULL
limma_proportion: NULL
limma_stdev_coef_lim: NULL
limma_trend: NULL
limma_robust: NULL
limma_winsor_tail_p: NULL
limma_adjust_method: NULL
limma_p_value: NULL
limma_lfc: NULL
limma_confint: NULL
exploratory_n_features: null
exploratory_clustering_method: null
exploratory_cor_method: null
exploratory_whisker_distance: null
exploratory_mad_threshold: null
exploratory_main_variable: null
exploratory_assay_names: NULL
exploratory_final_assay: NULL
exploratory_palette_name: NULL
versions_file: null # e.g 17_software_versions.yml
logo: null
css: null
citations: null
filtering_min_samples: 1
filtering_min_abundance: 1
filtering_min_proportion: NULL
filtering_grouping_var: NULL
differential_file_suffix: NULL
differential_feature_id_column: NULL
differential_feature_name_column: NULL
differential_fc_column: NULL
differential_pval_column: NULL
differential_qval_column: NULL
differential_min_fold_change: NULL
differential_foldchanges_logged: NULL
differential_max_pval: NULL
differential_max_qval: NULL
differential_palette_name: NULL
differential_subset_to_contrast_samples: NULL
deseq2_test: NULL
deseq2_fit_type: NULL
deseq2_sf_type: NULL
deseq2_min_replicates_for_replace: NULL
deseq2_use_t: NULL
deseq2_lfc_threshold: NULL
deseq2_alt_hypothesis: NULL
deseq2_independent_filtering: NULL
deseq2_p_adjust_method: NULL
deseq2_alpha: NULL
deseq2_minmu: NULL
deseq2_vs_method: NULL
deseq2_shrink_lfc: NULL
deseq2_cores: NULL
deseq2_vs_blind: NULL
deseq2_vst_nsub: NULL
gsea_run: false
gsea_nperm: NULL
gsea_permute: NULL
gsea_scoring_scheme: NULL
gsea_metric: NULL
gsea_sort: NULL
gsea_order: NULL
gsea_set_max: NULL
gsea_set_min: NULL
gsea_norm: NULL
gsea_rnd_type: NULL
gsea_make_sets: NULL
gsea_median: NULL
gsea_num: NULL
gsea_plot_top_x: NULL
gsea_rnd_seed: NULL
gsea_save_rnd_lists: NULL
gsea_zip_report: NULL
gsea_chip_file: NULL
gprofiler2_run: false
gprofiler2_organism: NULL
gprofiler2_significant: NULL
gprofiler2_measure_underrepresentation: NULL
gprofiler2_correction_method: NULL
gprofiler2_sources: NULL
gprofiler2_evcodes: NULL
gprofiler2_max_qval: NULL
gprofiler2_token: NULL
gprofiler2_background_file: NULL
gprofiler2_background_column: NULL
gprofiler2_domain_scope: NULL
gprofiler2_min_diff: NULL
gprofiler2_palette_name: NULL
gene_sets_files: NULL
---
<!-- Load libraries -->
```{r, include=FALSE}
library(knitr)
library(yaml)
library(shinyngs)
library(plotly)
library(DT)
```
<!-- Define some functions -->
```{r, include=FALSE}
round_dataframe_columns <- function(df, columns = NULL, digits = -1) {
if (digits == -1) {
return(df) # if -1, return df without rounding
}
df <- data.frame(df, check.names = FALSE) # make data.frame from vector as otherwise, the format will get messed up
if (is.null(columns)) {
columns <- colnames(df)[(unlist(lapply(df, is.numeric), use.names=F))] # extract only numeric columns for rounding
}
df[,columns] <- format(data.frame(df[, columns], check.names = FALSE), scientific=T, digits=params$report_round_digits)
# Convert columns back to numeric
for (c in columns) {
df[[c]][grep("^ *NA$", df[[c]])] <- NA
df[[c]] <- as.numeric(df[[c]])
}
df
}
```
```{r include = FALSE}
# Load the datatables js
datatable(NULL)
```
```{r, include=FALSE}
versions <- unlist(yaml.load_file(file.path(params$input_dir, params$versions_file)), recursive = FALSE)
params_table <- data.frame(Parameter = names(unlist(params)), Value = unlist(params), row.names = NULL)
# We'll subset the params table for different report sections
make_params_table <- function(name, pattern = NULL, remove_pattern = FALSE){
subparams <- params_table
if (! is.null(pattern)){
subparams <- subparams[grep(pattern, subparams$Parameter),]
}
if (remove_pattern){
subparams$Parameter <- sub(pattern, '', subparams$Parameter)
}
if (nrow(subparams) > 10){
dom <- 'tp'
}else{
dom <- 't'
}
print( htmltools::tagList(datatable(subparams, caption = paste("Parameters used for", name), rownames = FALSE, options = list(dom = dom)) ))
}
report_title <- paste0('Differential ', params$features_type, ' abundance report', ifelse(is.null(params$report_title), '', paste0(': ', params$report_title)))
report_subtitle <- paste0(ifelse(is.null(params$report_author), '', paste0('By ', params$report_author, ', ')), '<br>differentialabundance workflow version', versions[["Workflow.nf-core/differentialabundance"]])
```
---
title: "<img src=\"`r file.path(params$input_dir, params$logo)`\" style=\"float: left;\"/>`r report_title`"
subtitle: `r report_subtitle`
---
\
<!-- set notebook defaults -->
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
<!-- Include the CSS and set the logo -->
```{r, echo=FALSE}
htmltools::includeCSS(params$css)
```
```{r results="asis", echo=FALSE}
cat(paste0("
<style>
#TOC {
background-image: url(\"", knitr::image_uri(params$logo), "\");
}
</style>
"))
```
<!-- Include PI/contact info if provided -->
```{r, results='asis', echo=F, eval=!is.null(params$report_contributors)}
contributors <- gsub("\n", "<br>", params$report_contributors, fixed = TRUE)
contributors <- lapply(simpleSplit(contributors, ";"), function(s) {
splt <- simpleSplit(s, "<br>")
paste0("**", head(splt, 1), "**<br>", paste(tail(splt, -1), collapse = "<br>"))
})
for (r in seq_along(contributors)) {
if (r %% 2 == 1) cat("<div class='div-row'>")
cat(paste0("<div class='div-column'>", contributors[r], "</div>"))
if (r %% 2 == 0 || r == length(contributors)) cat("</div>")
}
```
<!-- Load input data -->
```{r, echo=FALSE}
observations <- read_metadata(file.path(params$input_dir, params$observations), id_col = params$observations_id_col)
observations_name_col <- ifelse(!is.null(params$observations_name_col), params$observations_name_col, params$observations_id_col)
if (! observations_name_col %in% colnames(observations)){
stop(paste('Invalid observation name column specified: ', observations_name_col, paste0('(Valid values are: ', paste(colnames(observations), collapse=', '),')')))
}
if (! is.null(params$features)){
features <- read_metadata(file.path(params$input_dir, params$features))
if (! is.null(params$features_metadata_cols)){
features <- features[,colnames(features) %in% simpleSplit(params$features_metadata_cols), drop = FALSE]
}
}
contrasts <- read_metadata(file.path(params$input_dir, params$contrasts_file))
contrasts$blocking <- na.replace(contrasts$blocking, '')
if (! 'id' %in% colnames(contrasts)){
contrasts$id <- apply(contrasts, 1, paste, collapse='_')
}
# Identify informative variables- those with a number of values greater than 1
# but less than N, with N being the number of observations. Make sure contrast
# variables are first in the list
informative_variables <- unique(c(contrasts$variable, chooseGroupingVariables(observations)))
# Remove any informative variables that group observations the same way
informative_variables <- informative_variables[ ! duplicated(lapply(structure(informative_variables, names= informative_variables), function(x) as.numeric(factor(observations[[x]], levels=unique(observations[[x]])))))]
assay_names <- simpleSplit(params$exploratory_assay_names)
names(assay_names) = assay_names
assay_files <- lapply(assay_names, function(x) params[[paste0(x, '_matrix')]])
assay_data <- lapply(assay_files, function(x) {
mat <- na.omit(
read_matrix(
x,
sample_metadata = observations,
row.names = 1
)
)
colnames(mat) <- observations[[observations_name_col]][match(colnames(mat), rownames(observations))]
mat
})
log2_assays <- params$exploratory_log2_assays
if (!is.null(log2_assays)) {
# Remove brackets from assay list. TODO: remove if this is added to cond_log2_transform_assays
log2_assays <- gsub('\\]$', '', gsub('^\\[', '', log2_assays))
}
assay_data <- cond_log2_transform_assays(assay_data, log2_assays, prettify_names = FALSE)
# Now we can rename the observations rows using the title field
rownames(observations) <- observations[[observations_name_col]]
# Run PCA early so we can understand how important each variable is
pca_datas <- lapply(names(assay_data), function(assay_type){
compilePCAData(assay_data[[assay_type]])
})
names(pca_datas) <- names(assay_data)
pca_vs_meta <- anova_pca_metadata(pca_datas[[params$exploratory_final_assay]]$coords, observations[,informative_variables, drop = FALSE], pca_datas[[params$exploratory_final_assay]]$percentVar)
# Show the variable with the tightest PC associations first
informative_variables <- rownames(pca_vs_meta)[order(pca_vs_meta[,1])]
# Pick the variable used for coloring purposes etc
if (params$exploratory_main_variable == 'contrasts'){
main_grouping_variable <- contrasts$variable[1]
}else if (params$exploratory_main_variable == 'auto_pca'){
main_grouping_variable <- informative_variables[1]
}else{
if (! params$exploratory_main_variable %in% colnames(observations)){
stop(paste('Invalid main variable specified: ', params$exploratory_main_variable))
}
main_grouping_variable <- params$exploratory_main_variable
}
# Make sure the main variable is shown first, with remaining shown in order of
# informativeness
informative_variables <- unique(c(main_grouping_variable, informative_variables))
groupColorScale <- makeColorScale(length(unique(observations[[main_grouping_variable]])), palette = params$exploratory_palette_name)
```
<!-- Read the differential results.
NOTE: differential results files are expected to have the pattern:
<variable>-<reference>-<target>-<blocking><differential_file_suffix>, e.g.
treatment-mCherry-hND6-batcheffect.deseq2.results.tsv
... where variable, reference, target and blocking come from the contrasts file
(with blocking being optional) and the suffix is defined in parameters.
-->
```{r, echo=FALSE}
differential_file_suffix <- params$differential_file_suffix
if (is.null(differential_file_suffix)) {
differential_file_suffix <- ifelse(params$study_type %in% c('rnaseq'), ".deseq2.results.tsv", ".limma.results.tsv")
}
differential_files <- lapply(contrasts$id, function(d){
file.path(params$input_dir, paste0(gsub(' |;', '_', d), differential_file_suffix))
})
# Initialize vector to store warning messages before merging tables
warnings_list <- c()
# Read differential results and merge with features table
results <- lapply(differential_files, function(diff_file) {
if (!file.exists(diff_file)) stop(paste("Differential file", diff_file, "does not exist"))
diff <- read_differential(
diff_file,
feature_id_column = params$differential_feature_id_column,
fc_column = params$differential_fc_column,
pval_column = params$differential_pval_column,
qval_column = params$differential_qval_column
)
# Log transform fold changes if not already logged
if (!params$differential_foldchanges_logged) {
diff[[params$differential_fc_column]] <- log2(diff[[params$differential_fc_column]])
}
# Annotate differential table if features table is provided
if (!is.null(params$features)) {
## Merge Differential expression table on features table
merged <- merge(features, diff, by.x = params$features_id_col, by.y = params$differential_feature_id_column)
## Get number of missing rows
n_missing <- length(setdiff(diff[[params$differential_feature_id_column]], merged[[params$features_id_col]]))
## Create warnings if necessary
warnings <- c(
## Missing IDs
if (n_missing > 0) sprintf(
'<p style="color:#DAA520;"><strong>WARNING:</strong> %d IDs from the differential table (%s) were lost on merge with features table (%s).</p>',
n_missing, basename(diff_file), basename(params$features)
),
## Check whether there are fewer rows, missing data
if (nrow(merged) < nrow(diff)) sprintf(
'<p style="color:#DAA520;"><strong>WARNING:</strong> Rows were lost on merge (%s -> %s). Original: %d, Merged: %d.</p>',
basename(diff_file), basename(params$features), nrow(diff), nrow(merged)
),
## Check whether there are more rows, possible duplications
if (nrow(merged) > nrow(diff)) sprintf(
'<p style="color:#DAA520;"><strong>WARNING:</strong> Rows were duplicated on merge (%s -> %s). Original: %d, Merged: %d.</p>',
basename(diff_file), basename(params$features), nrow(diff), nrow(merged)
)
)
} else {
merged <- diff
warnings <- character(0)
}
## Collect results
list(diff_features = merged, warnings = warnings)
})
# Separate differential_results and warnings_list from results
differential_results <- lapply(results, `[[`, "diff_features")
warnings_list <- unlist(lapply(results, `[[`, "warnings"))
names(differential_results) <- contrasts$id
```
<!-- Calculate some summary statistics -->
```{r, echo=FALSE}
# Function to make friendly contrast name from contrast components, including optional bits
name_contrast <- function(i){
contrast_name <- paste(contrasts$target[i], 'versus', contrasts$reference[i], 'in', contrasts$variable[i])
contrast_vals <- contrasts[i,]
populated <- colnames(contrasts)[! (is.na(contrast_vals) | contrast_vals == '' | is.null(contrast_vals))]
optional <- setdiff(populated, c('id', 'target', 'reference', 'variable'))
if (length(optional) > 0){
optional_part <- paste0('(', paste(paste(optional, contrasts[i,optional], sep=': '), collapse=', '), ')')
}else{
optional_part <- ''
}
paste(contrast_name, optional_part)
}
contrast_descriptions <- unlist(lapply(1:nrow(contrasts), function(x) name_contrast(x)))
# Check both adjusted and unadjusted p values
p_value_types <- list(Adjusted = params$differential_qval_column, Unadjusted = params$differential_pval_column)
p_value_thresholds <- list(Adjusted = params$differential_max_qval, Unadjusted = params$differential_max_pval)
sig_differential <-
lapply(names(p_value_types), function(pvt){
diff <- lapply(
1:nrow(contrasts),
function(x){
signif <- differential_results[[x]][,p_value_types[[pvt]] ] < p_value_thresholds[[pvt]]
list(
up = differential_results[[x]][which(
differential_results[[x]][,params$differential_fc_column ] > log2(params$differential_min_fold_change) &
signif
),],
down = differential_results[[x]][which(
differential_results[[x]][,params$differential_fc_column ] < log2(1/params$differential_min_fold_change) &
signif
),]
)
}
)
names(diff) <- contrast_descriptions
diff
})
names(sig_differential) <- names(p_value_types)
# Count the differential genes
differential_tables <- lapply(names(sig_differential), function(sd) do.call(rbind, lapply(sig_differential[[sd]], function(x) lapply(x, function(y) nrow(y)))))
names(differential_tables) <- names(sig_differential)
```
<!-- Write the report -->
# Abstract
This report summarises differential `r params$features_type` analysis as performed by the nf-core/differentialabundance pipeline.
# Data
```{r, echo=FALSE, results='asis'}
cat(paste0("\n## ", ucfirst(params$observations_type), "s\n"))
```
A summary of `r params$observations_type` metadata is below:
```{r, echo=FALSE, results='asis'}
display_columns <- union(c(params$observations_id_col, unique(contrasts$variable)), informative_variables)
minimal_fetchngs_cols <- c('sample', 'sample_title', 'strandedness', 'library_strategy', 'scientific_name')
# If the data came via fetchngs then we can infer a couple of things about the most useful columns
if (all(minimal_fetchngs_cols %in% colnames(observations))){
additional_useful_cols <- minimal_fetchngs_cols
}else{
additional_useful_cols <- colnames(observations)[which(apply(observations, 2, function(x) max(nchar(x))) <= 20)]
}
display_columns <- head(union(display_columns, additional_useful_cols), 5)
# Also add informative columns
display_columns <- unique(c(display_columns, informative_variables))
observations_to_print <- observations[,unique(display_columns)]
colnames(observations_to_print) <- prettifyVariablename(colnames(observations_to_print))
print( htmltools::tagList(datatable(observations_to_print, caption = paste(ucfirst(params$observations_type), 'metadata'), rownames = FALSE, options = list(dom = 'tp')) ))
```
## Contrasts
Comparisons were made between `r params$observations_type` groups defined using `r params$observation_type` metadata columns, as described in the following table of contrasts:
```{r, echo=FALSE, results='asis'}
contrasts_to_print <- contrasts
colnames(contrasts_to_print) <- prettifyVariablename(colnames(contrasts_to_print))
# Add design/model formulae to report
de_tool <- ifelse(params$study_type %in% c('rnaseq'), "deseq2", "limma")
contrasts_to_print$model <- sapply(contrasts_to_print$Id, function(id) {
model_file <- paste0(id, ".", de_tool, ".model.txt")
if (file.exists(model_file)) {
first_line <- readLines(model_file, n = 1)
return(first_line)
} else {
return(NA)
}
})
print( htmltools::tagList(datatable(contrasts_to_print, caption = paste0("Table of contrasts"), rownames = FALSE, options = list(dom = ifelse(nrow(contrasts_to_print) > 10, 'tp', 't'))) ))
```
# Results
## Counts
Input was a matrix of `r nrow(assay_data$raw)` `r params$features_type`s for `r ncol(assay_data$raw)` `r params$observations_type`s`r ifelse(nrow(assay_data$normalised) < nrow(assay_data$raw), paste0(', reduced to ', nrow(assay_data$normalised), ' ', params$features_type, 's after filtering for low abundance'), '')`.
## Exploratory analysis
### Abundance value distributions
The following plots show the abundance value distributions of input matrices. A log2 transformation is applied where not already performed.
#### Box plots
```{r, echo=FALSE, results='asis', fig.height=8}
p <- ggplot_boxplot(
assay_data,
experiment = observations,
colorby = main_grouping_variable,
expressiontype = paste("count per", params$features_type),
palette = groupColorScale,
whisker_distance = params$exploratory_whisker_distance,
base_size=8
)
print(p)
```
Whiskers in the above boxplots show `r params$exploratory_whisker_distance` times the inter-quartile range.
#### Density plots
```{r, echo=FALSE, results='asis', fig.height=8}
plotly_densityplot(
assay_data,
experiment = observations,
colorby = observations_name_col,
expressiontype = paste("count per", params$features_type),
makeColorScale(length(unique(observations[[params$observations_id_col]])), palette = "Set1")
)
```
```{r, echo=FALSE, results='asis'}
cat(paste0("\n### ", ucfirst(params$observations_type), " relationships\n"))
```
#### Principal components plots {.tabset}
Principal components analysis was conducted based on the `r params$exploratory_n_features` most variable `r params$features_type`s. Each component was annotated with its percent contribution to variance.
```{r, echo=FALSE, results='asis'}
# Create nested list to save the percentVars for reusing in the scree plot
percentVar_list <- list()
for (assay_type in rev(names(assay_data))){
pca_data <- pca_datas[[assay_type]]
for (iv in informative_variables){
cat(paste0("\n##### ", prettifyVariablename(assay_type), " (", iv, ")\n"))
plotdata <- pca_data$coords
plotdata$colorby <- factor(
observations[[iv]],
levels = unique(observations[[iv]])
)
pcaColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name)
# Make plotting data combining PCA coords with coloring groups etc
plotdata$name <- rownames(plotdata)
percentVar <- pca_data$percentVar
labels <- paste0(colnames(plotdata), " (", sprintf("%.1f", percentVar), "%)")
ncats <- length(unique(plotdata$colorby))
plot_types <- list("2" = "scatter", "3" = "scatter3d")
for (d in names(plot_types)) {
# Default plot args whatever we're doing
plot_args <- list(
x = pca_data$coords[, 1],
y = pca_data$coords[, 2],
xlab = labels[1],
ylab = labels[2],
colorby = plotdata$colorby,
plot_type = plot_types[[d]],
palette = pcaColorScale,
legend_title = prettifyVariablename(iv),
labels = plotdata$name,
show_labels = TRUE
)
if (d == "3") {
plot_args$z <- pca_data$coords[, 3]
plot_args$zlab <- labels[3]
}
print(htmltools::tagList(do.call("plotly_scatterplot", plot_args)))
}
if (! assay_type %in% names(percentVar_list)){
percentVar_list[[assay_type]] <- percentVar
}
}
}
```
```{r, echo=FALSE, results='asis', eval=params$report_scree}
cat(paste0("\n#### Scree plot {.tabset}"))
cat(paste0("\nThe following scree plot visualizes what percentage of total variation in the data can be explained by each of the principal components computed.\n"))
#iv <- informative_variables[1]
for (assay_type in names(percentVar_list)) {
percentVarData <- data.frame(percentVar_list[[assay_type]])
colnames(percentVarData) <- c("var_explained")
percentVarData$PCA <- as.numeric(rownames(percentVarData))
cat(paste0("\n##### ", prettifyVariablename(assay_type), "\n"))
print(
ggplot(percentVarData, aes(x=factor(PCA),y=var_explained, group=1)) +
theme_bw() +
geom_point(size=4) +
geom_line(linetype="dashed") +
xlab("PC") +
ylab("Percent variance explained")
)
cat("\n")
}
```
#### Principal components/ metadata associations
For the variance stabilised matrix, an ANOVA test was used to determine assocations between continuous principal components and categorical covariates (including the variable of interest).
The resulting p values are illustrated below.
```{r, echo=FALSE, results='asis'}
# This is a little hack to work around a bug in d3heatmap with single-row data
# frames.
if (nrow(pca_vs_meta) == 1){
plot_pca_meta <- rbind(pca_vs_meta, pca_vs_meta)
}else{
plot_pca_meta <- pca_vs_meta
}
d3heatmap::d3heatmap(
-log10(plot_pca_meta),
Rowv = FALSE,
dendrogram = 'none',
cellnote = plot_pca_meta,
cexCol = 0.8,
cexRow = 0.8,
height = (100 + (15 * nrow(plot_pca_meta))),
colors = colorRampPalette(
rev(
RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")
)
)(100)
)
for (variable in rownames(pca_vs_meta)){
sig_comps <- pca_vs_meta[variable,] < 0.1
if (any(sig_comps)){
min_sig_comp <- min(which(sig_comps))
min_sig_comp_p <- sprintf("%.2f", pca_vs_meta[variable, min_sig_comp])
cat(paste0('The variable \'', variable, '\' shows an association with ', colnames(pca_vs_meta)[min_sig_comp], ' (p = ', min_sig_comp_p,'). '))
}
}
```
#### Clustering dendrograms {.tabset}
A hierarchical clustering of `r params$features_type`s was undertaken based on `r ifelse(params$exploratory_n_features == -1, paste0("all ", params$features_type), paste0("the ", params$exploratory_n_features, " most variable ", params$features_type))`s. Distances between `r params$features_type`s were estimated based on `r params$exploratory_cor_method` correlation, which were then used to produce a clustering via the `r params$exploratory_clustering_method` method with `hclust()` in R.
```{r, echo=FALSE, results='asis'}
for (assay_type in rev(names(assay_data))){
for (iv in informative_variables){
cat(paste0("\n##### ", prettifyVariablename(assay_type), " (", iv, ")\n"))
variable_genes <- selectVariableGenes(matrix = assay_data[[assay_type]], ntop = ifelse(params$exploratory_n_features == -1, nrow(assay_data[[assay_type]]), params$exploratory_n_features))
dendroColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name)
p <- clusteringDendrogram(
2^assay_data[[assay_type]][variable_genes, ],
observations[, iv, drop = FALSE],
colorby = iv,
cor_method = params$exploratory_cor_method,
plot_title = paste0(
paste0(params$observations_type," clustering dendrogram, "),
ifelse(params$exploratory_n_features == -1, nrow(assay_data[[assay_type]]), paste0(params$exploratory_n_features, " most variable")), " ",
params$features_type,
"s\n(", params$exploratory_clustering_method, " clustering, ", params$exploratory_cor_method, " correlation)"),
cluster_method = params$exploratory_clustering_method,
palette = dendroColorScale,
labelspace = 0.25
)
# Defaults in shinyngs make the text in this plot a bit big for the report, so
# scale it down a bit
print(p, vp=grid::viewport(gp=grid::gpar(cex=0.7)))
cat("\n")
}
}
```
```{r, echo=FALSE, results='asis', warning=FALSE}
# We can't look for ouliers in sets of less than 3 samples, so exclude variables
# unless the minimum group size is larger than that
iv_min_group_sizes <- unlist(lapply(informative_variables, function(x) min(table(observations[[x]]))))
if (any(iv_min_group_sizes > 2)){
cat("\n### Outlier detection {.tabset}\n")
cat("\nOutlier detection based on [median absolute deviation](https://archive.ph/o3thZ) was undertaken, the outlier scoring is plotted below. For more on MAD, see [this wiki article](https://en.wikipedia.org/wiki/Median_absolute_deviation).\n")
}
foo <- lapply(informative_variables[iv_min_group_sizes > 2], function(iv){
cat(paste("\n####", iv, "\n"))
plotdata <-
madScore(
matrix = assay_data[[params$exploratory_final_assay]],
sample_sheet = observations,
groupby = iv
)
if (! is.null(plotdata)){
mad_plot_args <- list(
x = plotdata$group,
y = plotdata$mad,
color = plotdata$outlier,
hline_thresholds = c("Outlier threshold" = params$exploratory_mad_threshold),
palette = makeColorScale(2, palette = params$differential_palette_name),
legend_title = "Outlier status",
labels = rownames(plotdata),
show_labels = TRUE,
xlab = "Sample group",
ylab = "MAD score"
)
print(htmltools::tagList(do.call("plotly_scatterplot", mad_plot_args)))
outliers <- rownames(plotdata)[plotdata$outlier]
if (length(outliers) == 0){
cat(paste0("No outlying samples were detected in groups defined by ", iv,".\n"))
}else{
cat(paste0(length(outliers), ' possible outliers were detected in groups defined by ', iv ,': ', paste(outliers, collapse=', '), "\n"))
}
}
})
```
## Differential analysis
```{r, echo=FALSE, results='asis', eval=params$study_type %in% c('rnaseq')}
# For DESeq2, add some more explanation to the report
cat(paste0(
"The `DESeq2 R` package was used for differential analysis. p-values were adjusted with the ", params$deseq2_p_adjust_method, " method to reduce the number of false positives. ", ucfirst(params$features_type), "s were considered differential if, for the respective contrast, the adjusted p-value was equal to or lower than ", params$deseq2_alpha, " and the absolute log2 fold change was equal to or higher than ", params$deseq2_lfc_threshold, "."
))
```
### Differential `r params$features_type` `r params$study_abundance_type` {.tabset}
```{r, echo=FALSE, results='asis'}
foo <- lapply(names(p_value_types), function(pvt){
cat("\n#### ", pvt, "\n")
print( htmltools::tagList(datatable(differential_tables[[pvt]], caption = paste0('Differential ', params$features_type, " ", params$abundance_type, ' (target relative to reference)'), options = list(dom = 't'), rownames = TRUE) ))
cat("\n")
})
```
```{r, echo=FALSE, results='asis', eval = FALSE}
differential_summary_string <- paste(
paste(
lapply(
1:nrow(contrasts),
function(x){
paste0(
"Contrast ", x, ' (', contrast_descriptions[x], ') ', "had ", differential_table[x,'up'], ' ', paste0(params$features_type, 's'), ' expressed significantly more highly in ', contrasts[x, 'target',], ' than ', contrasts[x, 'reference',], ' and ', differential_table[x,'down'], ' expressed at sifnificantly lower levels.'
)
}
),
collapse = ' '
)
)
cat(differential_summary_string)
```
### Differential `r params$features_type` details
```{r, echo=FALSE, results='asis', warning=FALSE, message=FALSE}
# Display all warnings related to number of rows
if (length(warnings_list) > 0) {
for (warning in warnings_list) { cat(warning) }
}
for (i in 1:nrow(contrasts)){
cat("\n#### ", contrast_descriptions[i], " {.tabset}\n")
## Make a volcano plot for the contrast first
# Label features with symbol as well as identifier
if (! is.null(params$features) && (! is.null(params$differential_feature_name_column)) ){
label_col <- params$differential_feature_name_column
}else{
label_col <- params$differential_feature_id_column
}
# Get the full set of differential stats for this contrast, removing rows with
# NAs in the fields we need.
full_de <- differential_results[[i]]
full_de <- subset(full_de, (! is.na(full_de[[params$differential_fc_column]])) & (! is.na(full_de[[params$differential_qval_colum]])) )
# We'll color by whether features are differential according to supplied thresholds
p_value_types <- list(Adjusted = params$differential_qval_column, Unadjusted = params$differential_pval_column)
p_value_thresholds <- list(Adjusted = params$differential_max_qval, Unadjusted = params$differential_max_pval)
for (pvt in names(p_value_types)){
cat("\n##### ", pvt, " p values\n")
pval_column <- p_value_types[[pvt]]
de_fc <- abs(full_de[[params$differential_fc_column]]) >= abs(log2(params$differential_min_fold_change))
de_fc_label <- paste("abs(logFC) >=", log2(params$differential_min_fold_change))
de_pval <- full_de[[pval_column]] <= p_value_thresholds[[pvt]]
de_pval_label <- paste(pvt, "<=", p_value_thresholds[[pvt]])
de_pval_fc_label <- paste(de_fc_label, '&', de_pval_label)
full_de$differential_status <- "Not significant"
full_de$differential_status[de_fc] <- de_fc_label
full_de$differential_status[de_pval] <- de_pval_label
full_de$differential_status[de_fc & de_pval] <- de_pval_fc_label
full_de$differential_status <- factor(full_de$differential_status, levels = c("Not significant", de_fc_label, de_pval_label, de_pval_fc_label), ordered = TRUE) # Factorize status so that non-significant is always first
# Define the thresholds we'll draw
hline_thresholds = vline_thresholds = list()
hline_thresholds[[paste(pval_column, '=', p_value_thresholds[[pvt]])]] = -log10(p_value_thresholds[[pvt]])
vline_thresholds[[paste(params$differential_fc_column, '<=', log2(params$differential_min_fold_change))]] = -log2(params$differential_min_fold_change)
vline_thresholds[[paste(params$differential_fc_column, '>=', log2(params$differential_min_fold_change))]] = log2(params$differential_min_fold_change)
palette_volcano <- append(c('#999999'), makeColorScale(3, params$differential_palette_name)) # set non-significant to gray
plot_args <- list(
x = full_de[[params$differential_fc_column]],
y = -log10(full_de[[pval_column]]),
colorby = full_de$differential_status,
ylab = paste("-log(10)", pval_column),
xlab = xlabel <- paste("higher in", contrasts$reference[i], " <<", params$differential_fc_column, ">> higher in", contrasts$target[i]),
labels = full_de[[label_col]],
hline_thresholds = hline_thresholds,
vline_thresholds = vline_thresholds,
show_labels = FALSE,
legend_title = "Differential status",
palette = palette_volcano
)
# Let's equalize the axes
max_fc <- max(abs(full_de[[params$differential_fc_column]])) * 1.1
# Print warning if any p values are 0
zero_p <- length(which(full_de[[pval_column]]==0))
if (zero_p) {
cat(paste0("<i>", zero_p, " feature", ifelse(zero_p>1, "s are", " is"), " not shown because of p value = 0; please refer to the results tables.</i><br><br>"))
}
p <- do.call(plotly_scatterplot, plot_args) %>%
layout(xaxis = list(range=list(-max_fc, max_fc)))
print(htmltools::tagList(p))
## ... then show tables of the up/ down genes
for (dir in c('up', 'down')){
contrast_de <- sig_differential[[pvt]][[i]][[dir]]
cols_to_round <- c(params$differential_fc_column, params$differential_pval_column, params$differential_qval_column)
contrast_de[, cols_to_round] <- signif(contrast_de[, cols_to_round], 8)
colnames(contrast_de) <- prettifyVariablename(colnames(contrast_de))
if (nrow(contrast_de) > 0){
contrast_de <- round_dataframe_columns(contrast_de, digits=params$report_round_digits)
print( htmltools::tagList(datatable(contrast_de, caption = paste('Differential genes', dir, 'in', contrast_descriptions[i], " (check", differential_files[[i]], "for more detail)"), rownames = FALSE) ))
if ("Gene biotype" %in% colnames(contrast_de)) {
# Plot Differentially Expressed Genes by Gene Biotype
gene_biotype_table <- contrast_de %>%
group_by(`Gene biotype`) %>%
summarise(count = dplyr::n(), .groups = 'drop') %>%
filter(count > 0) %>%
arrange(desc(count))
gene_biotype_plot <- ggplot(gene_biotype_table, aes(x = reorder(`Gene biotype`, -count), y = count)) +
geom_bar(stat = "identity", position = position_dodge()) +
labs(
title = paste0("Differentially Expressed Genes by Gene Biotype (", dir, ")"),
x = "Gene Biotype",
y = "Number of Differentially Expressed Genes" ) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(gene_biotype_plot)
} else {
cat("Column 'Gene biotype' does not exist. Skipping plot.\n")
}
}else{
cat(paste0("No significantly differential '", dir, "' genes.\n\n"))
}
}
}
}
```
<!-- Gene set analysis results -->
```{r, echo=FALSE, results='asis'}
possible_gene_set_methods <- c('gsea', 'gprofiler2')
if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
cat("\n### Gene set analysis\n")
for (gene_set_method in possible_gene_set_methods){
if (unlist(params[paste0(gene_set_method, '_run')])){
cat("\n#### ", toupper(gene_set_method) ," {.tabset}\n")
if (gene_set_method == 'gsea') {
for (gmt_file in simpleSplit(params$gene_sets_files)) {
gmt_name <- basename(tools::file_path_sans_ext(gmt_file))
cat("\n##### ", gmt_name ," {.tabset}\n")
reference_gsea_tables <- paste0(contrasts$id, ".", gmt_name, '.gsea_report_for_', contrasts$reference, '.tsv')
target_gsea_tables <- paste0(contrasts$id, ".", gmt_name, '.gsea_report_for_', contrasts$target, '.tsv')
for (i in 1:nrow(contrasts)){
cat("\n###### ", contrast_descriptions[i], "\n")
target_gsea_results <- read_metadata(target_gsea_tables[i])[,c(-2,-3)]
target_gsea_results <- round_dataframe_columns(target_gsea_results, digits=params$report_round_digits)
print( htmltools::tagList(datatable(target_gsea_results, caption = paste0("\nTarget (", contrasts$target[i], ")\n"), rownames = FALSE) ))
ref_gsea_results <- read_metadata(reference_gsea_tables[i])[,c(-2,-3)]
ref_gsea_results <- round_dataframe_columns(ref_gsea_results, digits=params$report_round_digits)
print( htmltools::tagList(datatable(ref_gsea_results, caption = paste0("\nReference (", contrasts$reference[i], ")\n"), rownames = FALSE) ))
}
}
} else if (gene_set_method == 'gprofiler2') {
cat(paste0("\nThis section contains the results tables of the pathway analysis which was done with the R package gprofiler2. The differential fraction is the number of differential genes in a pathway divided by that pathway's size, i.e. the number of genes annotated for the pathway.",
ifelse(params$gprofiler2_significant, paste0(" Enrichment was only considered if significant, i.e. adjusted p-value <= ", params$gprofiler2_max_qval, "."), "Enrichment was also considered if not significant."), "\n"))
# Make sure to grab only non-empty files
for (i in 1:nrow(contrasts)) {