-
Notifications
You must be signed in to change notification settings - Fork 2
/
StatPcp.R
1226 lines (1045 loc) · 59.4 KB
/
StatPcp.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#' Parallel coordinate plot for both numeric and categorical data
#'
#' The parallel coordinate plot displays multiple y-axes, and shows the observations across
#' several dimensions as ploi-lines. This function work well with both numeric and categorical
#' variables at the same time after proper scaling.
#'
#' \code{method} is a character string that denotes how to scale the variables
#' in the parallel coordinate plot. Options are named in the same way as the options in `ggparcoord` (GGally):
#' \itemize{
#' \item{\code{raw}}{: raw data used, no scaling will be done.}
#' \item{\code{std}}{: univariately, subtract mean and divide by standard deviation. To get values into a [0,1] interval we use a linear transformation of f(y) = y/4+0.5. }
#' \item{\code{robust}}{: univariately, subtract median and divide by median absolute deviation. To get values into a [0,1] interval we use a linear transformation of f(y) = y/4+0.5. }
#' \item{\code{uniminmax}}{: univariately, scale so the minimum of the variable is zero, and the maximum is one.}
#' \item{\code{globalminmax}}{: gobal scaling; the global maximum is mapped to 1,
#' global minimum across the variables is mapped to 0. }
#' }
#'
#' \code{overplot} is a character string that denotes how to conduct overplotting
#' in the parallel coordinate plot. The lines from \code{geom_pcp()} are drawn according to the order they shown in your data set in default.
#' Note that this argument provides a framework, the order in the original data still has a role in overplotting,
#' especially for lines outside factor blocks(for \code{hierarchical} only), plots with \code{resort} turned on(for methods except \code{hierarchical}):
#' \itemize{
#' \item{\code{original}}{: use the original order, first shown first drawn.}
#' \item{\code{hierarchical}}{: hierarchically drawn according to the combinations of levels of factor variables,
#' which will change according to different level structures of factor variables you provided.
#' This was done separately for each factor block. The right most factor variables have the largest weight across a sequence of factor variables,
#' the last level of a factor variable has the largest weight within a factor variable.
#' Groups of lines with larger weight will be drawn on top. Lines outside of factor blocks still use the original order, which is different from other methods.}
#' \item{\code{smallfirst}}{: smaller groups of lines are drawn first, placing large groups of lines on top.}
#' \item{\code{largefirst}}{: larger groups of lines are drawn first, placing small groups of lines on top.}
#' }
#' @param mapping Set of aesthetic mappings created by [aes()] or
#' [aes_()]. If specified and `inherit.aes = TRUE` (the
#' default), it is combined with the default mapping at the top level of the
#' plot. You must supply `mapping` if there is no plot mapping.
#' @param data The data to be displayed in this layer. There are three
#' options:
#'
#' If `NULL`, the default, the data is inherited from the plot
#' data as specified in the call to [ggplot()].
#'
#' A `data.frame`, or other object, will override the plot
#' data. All objects will be fortified to produce a data frame. See
#' [fortify()] for which variables will be created.
#'
#' A `function` will be called with a single argument,
#' the plot data. The return value must be a `data.frame`, and
#' will be used as the layer data.
#' @param geom The geometric object to use display the data
#' @param position Position adjustment, either as a string, or the result of
#' a call to a position adjustment function.
#' @param na.rm If `FALSE`, the default, missing values are removed with
#' a warning. If `TRUE`, missing values are silently removed.
#' @param show.legend logical. Should this layer be included in the legends?
#' `NA`, the default, includes if any aesthetics are mapped.
#' `FALSE` never includes, and `TRUE` always includes.
#' It can also be a named logical vector to finely select the aesthetics to
#' display.
#' @param inherit.aes If `FALSE`, overrides the default aesthetics,
#' rather than combining with them. This is most useful for helper functions
#' that define both data and aesthetics and shouldn't inherit behaviour from
#' the default plot specification, e.g. [borders()].
#' @param ... Other arguments passed on to [layer()]. These are
#' often aesthetics, used to set an aesthetic to a fixed value, like
#' `colour = "red"` or `size = 3`. They may also be parameters
#' to the paired geom/stat.
#' @param method string specifying the method that should be used for scaling the values
#' in a parallel coordinate plot (see Details).
#' @param freespace A number in 0 to 1 (excluded). The total gap space among levels within each factor variable
#' @param boxwidth A number or a numeric vector (length equal to the number of factor variables) for the widths of the boxes for each factor variable
#' @param rugwidth A number or a numeric vector (length equal to the number of numeric variables) for the widths of the rugs for numeric variable
#' @param interwidth A number or a numeric vector (length equal to the number of variables minus 1) for the width for the lines between every neighboring variables, either
#' a scalar or a vector.
#' @param resort A integer or a integer vector to indicate the positions of vertical axes inside (can't be the boundary of) a sequence of factors.
#' To break three or more factors into sub factor blocks,
#' and conduct resort at the axes. Makes the plot clearer for adjacent factor variables.
#' @param overplot methods used to conduct overplotting when overplotting becomes an issue.
#' @param reverse reverse the plot, useful especially when you want to reverse the structure in factor blocks,
#' i.e. to become more ordered from right to left
#' @import ggplot2
#' @importFrom dplyr %>% group_by ungroup arrange filter
#' @importFrom tidyr spread
#' @importFrom assertthat assert_that has_name
#' @export
stat_pcp <- function(mapping = NULL, data = NULL,
geom = "segment", position = "identity",
...,
freespace = 0.1,
method = "uniminmax",
boxwidth = 0,
rugwidth = 0,
interwidth = 1,
resort = NULL,
overplot = "hierarchical",
reverse = FALSE,
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
layer(
data = data,
mapping = mapping,
stat = StatPcp,
geom = geom,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
na.rm = na.rm,
method = method,
freespace = freespace,
boxwidth = boxwidth,
rugwidth = rugwidth,
interwidth = interwidth,
resort = resort,
overplot = overplot,
reverse = reverse,
...
)
)
}
#' @rdname stat_pcp
#' @export
StatPcp <- ggproto(
"StatPcp", Stat,
# required_aes = c("vars"), # vars is disappeared by the time we get to the stat
default_aes = ggplot2::aes(
width = 0.75, linetype = "solid", fontsize=5,
shape = 19, colour = "grey30",
size = .1, fill = "grey30", alpha = .8, stroke = 0.1,
linewidth=.1, weight = 1, method = "uniminmax"),
setup_data = function (data, params) {
#browser()
idx <- grep("x__", names(data))
names(data) <- gsub("x__[0-9]+__", "", names(data))
# x__labels__ works together with data.frame(..., check.names = TRUE) (default) to allow spaces in names, also won't add .1 .2 after same variables
# x__labels__ is just for x labels
x__labels__ <- names(data)[idx]
data <- data.frame(data, stringsAsFactors = TRUE)
data <- gather_pcp(data, idx)
data <- transform_pcp(data, method = params$method)
data$x__labels__ <- c(x__labels__, rep("nothing", times = nrow(data) - length(x__labels__)))
data
},
# want to figure out the number of observations
# want to figure out the number of different classer of the variables
### setup_params accept params from stat_pcp or geom_pcp?
setup_params = function(data, params) {
# browser()
params$freespace <- ifelse(is.null(params$freespace), 0.1, params$freespace)
# And other parameters
params
},
compute_layer = function(self, data, params, layout, ...) {
# browser()
# adjust function to avoid deleting all data
ggplot2:::check_required_aesthetics(
self$required_aes,
c(names(data), names(params)),
ggplot2:::snake_class(self)
)
# Trim off extra parameters
params <- params[intersect(names(params), self$parameters())]
scales <- layout$get_scales(data$PANEL[1])
layout$panel_scales_x <- list(xscale_pcp(data, params, layout, ...)) # only one scale overall - might need one for each panel
args <- c(list(data = quote(data), scales = quote(scales)), params)
gg <- ggplot2:::dapply(data, "PANEL", function(data) {
tryCatch(do.call(self$compute_panel, args), error = function(e) {
warning("Computation failed in `", ggplot2:::snake_class(self), "()`:\n",
e$message, call. = FALSE)
ggplot2:::new_data_frame()
})
})
gg
},
# want to calculate the parameters directly can be used for geom_segment and geom_ribbon
compute_panel = function(data, scales, method = "uniminmax", freespace = 0.1, boxwidth = 0,
rugwidth = 0 , interwidth = 1,
resort = NULL, overplot = "original", reverse = FALSE) {
# Input check and sensible warning
#browser()
# Data preparation: to convert the input data to the form we can directly use
# nobs, classpcp will also be used in other places, so they are kept
# If the input data is changed, we might need other ways to do the following part
# but these three values/outputs are required, should be *exact* the same things
# number of observations
obs_ids <- unique(data$id__)
nobs <- length(unique(data$id__))
# a vector to tell the class of variables
classpcp <- data$class[1 - nobs + (1:(nrow(data)/nobs))*nobs]
data_spread <- prepare_data(data, classpcp, nobs)
# at this time, data_spread is like the original data set, with columns properly defined
# assume numeric variables are properly scaled into 0-1
# several possible combinations: num to num, num to factor, factor to num, factor to factor
# we use the function: 'classify' here to do this classification
# if the names of orignal varibles have "id", something might be wrong. update, now "id__"
if (is.character(resort)) {
resort <- which(names(data_spread) %in% resort) - 1
}
# check if resort input is sensible
if(!is.null(resort)){
resort_check <- lapply(resort, FUN = function(x){
output <- !any(classpcp[c(x-1, x, x+1)] %in% c("numeric", "integer")) & (x>1) & (x<length(classpcp))
output
}) %>% unlist()
assertthat::assert_that(all(resort_check), msg = "resort should be interior positions of a sequence of factor variables, e.g. 2 for sequence 1:3")
}
classification <- classify(classpcp, resort = resort)
# for num to num, set up
if (!length(classification$num2num) == 0) {
# set up ystart, yend.(sometimes we plus one to adjust for ID column)
# for ystart of lines (seems we can use unlist to data.frame directly)
data_final_ystart_num2num <- unlist(data_spread[, classification$num2num + 1])
# for yend of lines
data_final_yend_num2num <- unlist(data_spread[, classification$num2num + 2])
# for xstart of lines
data_final_xstart_num2num <- rep(classification$num2num, each = nobs)
# for xend of lines
data_final_xend_num2num <- rep(classification$num2num + 1, each = nobs)
} else {
data_final_ystart_num2num <- NULL
data_final_yend_num2num <- NULL
data_final_xstart_num2num <- NULL
data_final_xend_num2num <- NULL
}
# for num to factor, set up
if (!length(classification$num2fac) == 0) {
# Here I want to treat the factor(categorical) variable as bands according to its levels,
# so I'm not going to treat it as several points. the end points uniformly distributed within each band
# I also want to order those end points landing on the bands somehow
# for ystart of lines(same as num2num, use unlist withour as.list first)
data_final_ystart_num2fac <- unlist(data_spread[, classification$num2fac + 1])
# for yend of lines
# first calculete the number of levels and number of observations landing in each level
nlevels_list <- lapply(data_spread[, classification$num2fac + 2, drop = FALSE],
FUN = function(x) list(nlevels = nlevels(x),
table = table(x)))
# uniformly assign space for each level and observations within each level
# inserted some space between every two levels here, called 'freespace' for the space in total
# obs_position is the postion assigned for factors
obs_position <- assign_fac(nlevels_list, nobs, freespace = freespace)
# for yend of lines, continued
# write another function to arrange the positions of the end
# points according to the ystart, and the order of data
data_final_yend_num2fac <- unlist(arrange_fac_by_ystart(data_spread,
start_position = classification$num2fac + 1,
end_position = classification$num2fac + 2,
obs_position = obs_position))
# for xstart of lines
data_final_xstart_num2fac <- rep(classification$num2fac, each = nobs)
# for xend of lines
data_final_xend_num2fac <- rep(classification$num2fac + 1, each = nobs)
} else {
data_final_ystart_num2fac <- NULL
data_final_yend_num2fac <- NULL
data_final_xstart_num2fac <- NULL
data_final_xend_num2fac <- NULL
}
# for factor to num, set up (this should be similar to num2fac)
if (!length(classification$fac2num) == 0) {
# need to do some adjustments to make the functions above more general and can be used here
# for xstart of lines
data_final_xstart_fac2num <- rep(classification$fac2num, each = nobs)
# for xend of lines
data_final_xend_fac2num <- rep(classification$fac2num + 1, each = nobs)
# for yend of lines
data_final_yend_fac2num <- unlist(data_spread[, classification$fac2num + 2])
# for ystart of lines (mimic the calculation to num2fac, be careful about the difference)
nlevels_list_2 <- lapply(data_spread[, classification$fac2num + 1, drop = FALSE],
FUN = function(x) list(nlevels = nlevels(x),
table = table(x)))
obs_position_2 <- assign_fac(nlevels_list_2, nobs, freespace = freespace)
# here arrange_fac_by_ystart, actually arranges fac (ystart) by yend
data_final_ystart_fac2num <- unlist(arrange_fac_by_ystart(data_spread,
start_position = classification$fac2num + 2,
end_position = classification$fac2num + 1,
obs_position = obs_position_2))
} else {
data_final_xstart_fac2num <- NULL
data_final_xend_fac2num <- NULL
data_final_yend_fac2num <- NULL
data_final_ystart_fac2num <- NULL
}
# we have to make sure those postions are consistent, which are on the same vertical axis, but shared by different pairs
# even if it is consistent(same), which I think is very likely ensured by our consitent method of dealing with variables
# we can still make some improvement above, to save some calculation to avoid twice calculation of same objects
# the only variables, we need to care are factor variables.
# for factor block, set up
# make use of the function to calculate level_range inside assign_fac(),
# and nlevel_lists as before when dealing with factors
# write a function for this part to assign and match the factors,
# we may first calculate a table of the possible combinations between every two factors, and then assign position
# with a constant freespace = 0.1, we make sure the lenghs of area taken are the same among factors
if (!length(classification$fac2fac) == 0) {
# some values needed
# to find the factor block(more than one factor together)
# produce continuous_fac for each factor_block
# 0622new: accomodate to new classification method, continuous_fac_all_raw
continuous_fac_all_raw <- as.vector(rbind(classification$fac2fac, classification$fac2fac + 1))
continuous_fac_all <- continuous_fac_all_raw[c(which(diff(continuous_fac_all_raw) != 0 & diff(continuous_fac_all_raw) != -1 ),
length(continuous_fac_all_raw))]
break_position <- c(0, which(diff(continuous_fac_all) != 1), length(continuous_fac_all))
continuous_fac_all_list <- lapply(1:(length(break_position) - 1), FUN = function(x) {
continuous_fac_all[(break_position[x] + 1):break_position[x + 1]]
})
# detect if there is a numeric variable prior to the factor block, after the factor block
start_fac2fac <- continuous_fac_all[break_position[-length(break_position)] + 1]
end_fac2fac <- continuous_fac_all[break_position[-1]]
# to identify which columns should be used to sort factor blocks
bywhich <- prepare_bywhich(start_fac2fac, classpcp)
if (is.na(bywhich[1])) {
start_position <- as.data.frame(matrix(rep(1:nobs, length(bywhich)), ncol = length(bywhich)))
} else {
start_position <- data_spread[,bywhich + 1,drop = FALSE]
}
# use Map to apply the function to every factor_block
arranged_fac_block <- Map(f = function(x, y) {
process_fac2fac(data_spread = data_spread,
continuous_fac = x,
start_position = y,
freespace = freespace,
nobs = nobs)},
continuous_fac_all_list,
as.data.frame(start_position))
# 0622new
# organize the output correctly into one
data_final_xstart_fac2fac <- unlist(lapply(arranged_fac_block,
FUN = function(x) x[[1]]$data_final_xstart_fac2fac))
data_final_ystart_fac2fac <- unlist(lapply(arranged_fac_block,
FUN = function(x) x[[1]]$data_final_ystart_fac2fac))
data_final_xend_fac2fac <- unlist(lapply(arranged_fac_block,
FUN = function(x) x[[1]]$data_final_xend_fac2fac))
data_final_yend_fac2fac <- unlist(lapply(arranged_fac_block,
FUN = function(x) x[[1]]$data_final_yend_fac2fac))
# also extract the bandid information
data_final_ystart_fac2fac_bandid <- unlist(lapply(arranged_fac_block,
FUN = function(x) x[[2]]$data_final_ystart_fac2fac_bandid))
data_final_yend_fac2fac_bandid <- unlist(lapply(arranged_fac_block,
FUN = function(x) x[[2]]$data_final_yend_fac2fac_bandid))
} else {
data_final_xstart_fac2fac <- NULL
data_final_ystart_fac2fac <- NULL
data_final_xend_fac2fac <- NULL
data_final_yend_fac2fac <- NULL
}
# calculations and modifications for break points in factor blocks
# There are some parts of those modifications above should be integrated to the original calculation for better performance
# is there possibility that this is problem can be sovled using same band assignment at once as in usual case, instead of recursively calculate that?
if ((!is.null(resort)) & (!length(classification$fac2fac) == 0)) {
# identify the groups of sub-factor blocks, labeled for which sub-factor block
sub_fac_block <- which(c(0, end_fac2fac) == c(start_fac2fac, 0))
diff_sub_fac_block <- c(0, which(diff(sub_fac_block) != 1), length(sub_fac_block))
sub_fac_block_list <- lapply(1:(length(diff_sub_fac_block) - 1), FUN = function(x) {
c(sub_fac_block[diff_sub_fac_block[x] + 1] - 1 ,
sub_fac_block[(diff_sub_fac_block[x] + 1):diff_sub_fac_block[x + 1]])
})
# To get the last variable positions and bandids within the first sub-factor block within a group of a sub-factor block
last_y <- lapply(1:length(sub_fac_block_list), FUN = function(x) {
# temp_yend, temp_yendid are used to save some coding for the long formular
temp_yend <- arranged_fac_block[[sub_fac_block_list[[x]][1]]][[1]][["data_final_yend_fac2fac"]]
temp_yendid <- arranged_fac_block[[sub_fac_block_list[[x]][1]]][[2]][["data_final_yend_fac2fac_bandid"]]
# length(temp_yend) = length(temp_yendid)
last_yend <- temp_yend[(length(temp_yend) - nobs + 1):length(temp_yend)]
last_yendid <- temp_yendid[(length(temp_yendid) - nobs + 1):length(temp_yendid)]
output <- list(last_yend = last_yend, last_yendid = last_yendid)
output
})
# use dirty "for loops", we should have a better way of applying this
# the following works for each sub-factors blocks group, we use Map to apply for each group
data_break_y <- Map(f = function(x, y) {
last_yend <- y$last_yend
last_yendid <- y$last_yendid
data_break_ystart <- vector()
data_break_yend <- vector()
data_break_yend_bandid <- vector()
# the following variables used to replace those values for factor block which were calculated before
# actually, we might save some calculation by adding some conditions on the previous calculation
data_replace_block_ystart <- vector()
data_replace_block_yend <- vector()
data_replace_block_xstart <- vector()
data_replace_block_xend <- vector()
for (i in x[-1]) {
# we need to save this ystart here
data_break_ystart <- c(data_break_ystart, last_yend)
# need some modificaation on process_fac2fac to take the exsiting bandid for numeric values into consideration
# and works for the current task for sub-factor blocks
# we shouldn't work on the first sub-factor block
arranged_fac_block_sub<- process_fac2fac(data_spread = data_spread,
continuous_fac = continuous_fac_all_list[[i]],
start_position = last_yend,
freespace = freespace,
nobs = nobs,
subfac = TRUE,
start_bandid = last_yendid)
# save some typing same as in calculate lasy_y
temp_yend <- arranged_fac_block_sub[[1]]$data_final_yend_fac2fac
temp_yendid <- arranged_fac_block_sub[[2]]$data_final_yend_fac2fac_bandid
last_yend <- temp_yend[(length(temp_yend) - nobs + 1):length(temp_yend)]
last_yendid <- temp_yendid[(length(temp_yendid) - nobs + 1):length(temp_yendid)]
# Here, we made a choice to only return the segments between the sub factor block
# Actually, we can return all the lines except the first sub-factor block(we can even also do this by using the original start_position first),
# in that case, we can use these calculation to replace the orignal factor block calculations to remove replicated calculations
# We might do that later, just be careful about the xstart, xend value, since it has 0
# we also don't return xstart, and xend, since they are calculated later
# When use process_fac2fac(subfac = TRUE), the returned xstart, xend positions are not correct, we can change this in that function
# or regenerate afterwards like what I will do in the following, since it is trivil calculation
# the followings are used to replace those values calculated for factor blocks before
data_replace_block_xstart <- c(data_replace_block_xstart, rep(continuous_fac_all_list[[i]][-length(continuous_fac_all_list[[i]])], each = nobs))
data_replace_block_xend <- c(data_replace_block_xend, rep(continuous_fac_all_list[[i]][-length(continuous_fac_all_list[[i]])], each = nobs) + 1)
data_replace_block_ystart <- c(data_replace_block_ystart, temp_yend[1:length(rep(continuous_fac_all_list[[i]][-length(continuous_fac_all_list[[i]])], each = nobs))])
data_replace_block_yend <- c(data_replace_block_yend, temp_yend[-(1:nobs)])
data_break_yend <- c(data_break_yend, temp_yend[1:nobs])
data_break_yend_bandid <- c(data_break_yend_bandid, temp_yendid[1:nobs])
}
data_break_y <- list(data_break_ystart = data_break_ystart,
data_break_yend = data_break_yend,
data_break_yend_bandid = data_break_yend_bandid,
data_replace_block_xstart = data_replace_block_xstart,
data_replace_block_xend = data_replace_block_xend,
data_replace_block_ystart = data_replace_block_ystart,
data_replace_block_yend = data_replace_block_yend)
data_break_y
},
sub_fac_block_list,
last_y)
replace_position_xstart <- unique(unlist(lapply(data_break_y, FUN = function(x) {
x$data_replace_block_xstart
})))
data_final_ystart_fac2fac[data_final_xstart_fac2fac %in% replace_position_xstart] <- unlist(lapply(data_break_y, FUN = function(x) {
x$data_replace_block_ystart
}))
data_final_yend_fac2fac[data_final_xstart_fac2fac %in% replace_position_xstart] <- unlist(lapply(data_break_y, FUN = function(x) {
x$data_replace_block_yend
}))
}
# Modification parts begin--------------------------------------------------
# Because a factor variable can only be ordered according to one side, we do the modification
# Because we arranged both num-fac and factor block separately, we need to overwirte the num-fac when the fac is actually a fac block
# some modification for num2fac_blcok (num2fac, fac is a factor block, more than one factor)
# detect those variables
# consider zero or multiple factor blocks
num2fac_block_relative <- which((classification$num2fac + 1) %in% classification$fac2fac)
num2fac_block_fac_relative <- which((unique(classification$fac2fac) -1) %in% classification$num2fac)
num2fac_change <- unlist(lapply(num2fac_block_relative, FUN = function(x) {
((x - 1)*nobs + 1):(x*nobs)
}))
num2fac_fac_input <- unlist(lapply(num2fac_block_fac_relative, FUN = function(x) {
((x - 1)*nobs + 1):(x*nobs)
}))
# make modification
data_final_yend_num2fac[num2fac_change] <- data_final_ystart_fac2fac[num2fac_fac_input]
# some modification for fac2num_blcok (fac2num, fac is a factor block, more than one factor)
# for multiple factor blocks, and multiple places to modify
# detect those variables
# consider zero or multiple factor blocks
fac2num_block_relative <- which(classification$fac2num %in% (classification$fac2fac + 1))
fac2num_change <- unlist(lapply(fac2num_block_relative, FUN = function(x) {
((x - 1)*nobs + 1):(x*nobs)
}))
fac2num_block_fac_relative <- which((unique(classification$fac2fac) + 1) %in% classification$fac2num)
fac2num_fac_input <- unlist(lapply(fac2num_block_fac_relative, FUN = function(x) {
((x - 1)*nobs + 1):(x*nobs)
}))
# make modification
data_final_ystart_fac2num[fac2num_change] <- data_final_yend_fac2fac[fac2num_fac_input]
# some other modification for fac2num (for num-fac-num case)
# we don't need to adjust fac2num for num-fac-num case, since it is adjusted by num2fac, to keep consistent
# so we need to modify it back
# detect those variables
fac2num_numfacnum <- classification$fac2num[classification$fac2num %in% (classification$num2fac + 1)]
fac2num_numfacnum_relative <- which(classification$fac2num %in% fac2num_numfacnum)
num2fac_numfacnum_relative <- which(classification$num2fac %in% (fac2num_numfacnum - 1))
if ((length(fac2num_numfacnum_relative) != 0)&(length(num2fac_numfacnum_relative) != 0)){
# for ystart of lines, keep it same as the num2fac result; this is the only thing that should be corrected
# data_final_ystart_fac2num[((fac2num_numfacnum_relative - 1)*nobs + 1):(fac2num_numfacnum_relative*nobs)] <-
# data_final_yend_num2fac[((num2fac_numfacnum_relative - 1)*nobs + 1):(num2fac_numfacnum_relative*nobs)]
if (length(fac2num_numfacnum_relative) != length(num2fac_numfacnum_relative)) stop("unexpected error in fac-num-fac case, please report/fix")
for (i in seq_along(fac2num_numfacnum_relative)) {
data_final_ystart_fac2num[((fac2num_numfacnum_relative[i] - 1)*nobs + 1):(fac2num_numfacnum_relative[i]*nobs)] <-
data_final_yend_num2fac[((num2fac_numfacnum_relative[i] - 1)*nobs + 1):(num2fac_numfacnum_relative[i]*nobs)]
}
}
# put everything together
# DO NOT change the order for num2num, num2fac, fac2num, fac2fac, they are used in the creation of id column of data_boxwidth
data_final_xstart <- c(data_final_xstart_num2num,
data_final_xstart_num2fac,
data_final_xstart_fac2num,
data_final_xstart_fac2fac)
data_final_xend <- c(data_final_xend_num2num,
data_final_xend_num2fac,
data_final_xend_fac2num,
data_final_xend_fac2fac)
data_final_ystart <- c(data_final_ystart_num2num,
data_final_ystart_num2fac,
data_final_ystart_fac2num,
data_final_ystart_fac2fac)
data_final_yend <- c(data_final_yend_num2num,
data_final_yend_num2fac,
data_final_yend_fac2num,
data_final_yend_fac2fac)
data_final <- data.frame(x = data_final_xstart,
xend = data_final_xend,
y = data_final_ystart,
yend = data_final_yend)
# This has different length from the original data coming to compute_panel
# interval length, boxwidth, rugwidth ajustment preparation
width_adjusted <- prepare_width_ajustment(classpcp, boxwidth, rugwidth, interwidth, reverse = reverse)
# adjust the data according to the adjusted position
data_boxwidth <- data_final
data_boxwidth$x <- width_adjusted$boxwidth_xend[data_boxwidth$x]
data_boxwidth$xend <- width_adjusted$boxwidth_xstart[data_boxwidth$xend]
# add parallel lines(segments, rugs) after adjusted for boxwidth
# add segments for boxes
segment_which <- which(!(data_final$x %in% resort))
segment_xstart <- data_final$x[segment_which]
data_segment_xstart <- width_adjusted$boxwidth_xstart[segment_xstart]
data_segment_xend <- width_adjusted$boxwidth_xend[segment_xstart]
data_segment_ystart <- data_final$y[segment_which]
data_segment_yend <- data_segment_ystart
# for the last variable seperately
data_segment_xstart <- c(data_segment_xstart, width_adjusted$boxwidth_xstart[rep(length(classpcp), nobs)])
data_segment_xend <- c(data_segment_xend, width_adjusted$boxwidth_xend[rep(length(classpcp), nobs)])
data_segment_ystart <- c(data_segment_ystart,
data_final[which(data_final$xend == length(classpcp)), "yend"])
data_segment_yend <- data_segment_ystart
if (!is.null(resort)) {
# lines for the break points
# a naive try, This is one choice(no sorted lines)
resort_which <- which(data_final$x %in% resort)
resort_xstart <- data_final$x[resort_which]
data_break_xstart <- width_adjusted$boxwidth_xstart[resort_xstart]
data_break_xend <- width_adjusted$boxwidth_xend[resort_xstart]
# we need to distinguish between the replicated variables, also used in above modifications
# is this safe?: using different selections, can they match each other(in this break case, they are all inside factor blocks, they should match)
# data_break_ystart <- data_final$yend[which(data_final$xend %in% resort)]
# data_break_yend <- data_final$y[resort_which]
# we need to make sure the the x, y, xend, yend match each other, now it is ensured by that the calculations in fac2fac related chuncks
data_break_ystart <- unlist(lapply(data_break_y, FUN = function(x) {
x$data_break_ystart
}))
data_break_yend <- unlist(lapply(data_break_y, FUN = function(x) {
x$data_break_yend
}))
data_break_yend_bandid <- unlist(lapply(data_break_y, FUN = function(x) {
x$data_break_yend_bandid
}))
}
# !!!Notice: the following reorder part, only ajusts the order of drawing(geom_segment draws according to the order in the data), and doesn't mean to affect anything else.
# To reorder the observations for factor blocks(seperate for each block) to address undesired overlap
# We need to work here, because the some parts(most parts to do modifications between different sections) rely on the orginal order
# We will reorder the main part first, and then the part inside resort, which used different line arrangement method.
# We are benefited from:
# 1. the work of keeping bandid(cluster id) for factor blocks
# 2. the fact that bandid is hiearchically assigned (some magic is used inside bandid() helper function)
# The resorts make the case more complex, we will see how it will finally work out.(how bandid is carried over resort?)
# we need to reorder the obs_id for those parts too later.
# parallel segments inside boxes are created very late! (after we have data_final, and is based on that)
assert_that(!is.null(overplot))
assert_that(overplot %in% c("original", "hierarchical", "smallfirst", "largefirst"))
if (overplot == "hierarchical") {
turn_on_off <- TRUE
} else {
turn_on_off <- FALSE
}
if ((!length(classification$fac2fac) == 0) & turn_on_off) {
# data_final_ystart_fac2fac_bandid and data_final_yend_fac2fac_bandid are actually same
data_final_ystart_fac2fac_split <- split(data_final_ystart_fac2fac, f = rep(1:(length(data_final_ystart_fac2fac)/nobs), each = nobs))
data_final_yend_fac2fac_split <- split(data_final_yend_fac2fac, f = rep(1:(length(data_final_ystart_fac2fac)/nobs), each = nobs))
data_final_ystart_fac2fac_bandid_split <- split(data_final_ystart_fac2fac_bandid, f = rep(1:(length(data_final_ystart_fac2fac)/nobs), each = nobs))
data_final_ystart_fac2fac <- unlist(Map(f = function(x, y) {
# order() is used here, potetial problem of dealing with ties, it seems to use a method like rank(ties.method = "first"), which we used before
# maybe we can use a rank based method to be consistent: order(rank(,ties.method = "first"))
# This should not be a problem anyway, since the order now is only for drawing segments
x <- x[order(y)]
},
data_final_ystart_fac2fac_split,
data_final_ystart_fac2fac_bandid_split
))
data_final_yend_fac2fac <- unlist(Map(f = function(x, y) {
# order() is used here, potetial problem of dealing with ties, it seems to use a method like rank(ties.method = "first"), which we used before
# maybe we can use a rank based method to be consistent: order(rank(,ties.method = "first"))
x <- x[order(y)]
},
data_final_yend_fac2fac_split,
data_final_ystart_fac2fac_bandid_split
))
# To adjust the id orders accordingly
# a little different calculation but same idea as above
obs_ids_fac2fac_split <- lapply(data_final_ystart_fac2fac_split, FUN = function(x) obs_ids)
obs_ids_fac2fac <- unlist(Map(f = function(x, y) {
# order() is used here, potetial problem of dealing with ties, it seems to use a method like rank(ties.method = "first"), which we used before
# maybe we can use a rank based method to be consistent: order(rank(,ties.method = "first"))
x <- x[order(y)]
},
obs_ids_fac2fac_split,
data_final_ystart_fac2fac_bandid_split
))
# The part for resort:
if(!is.null(resort)) {
data_break_ystart_split <- split(data_break_ystart, f = rep(1:(length(data_break_ystart)/nobs), each = nobs))
data_break_yend_split <- split(data_break_yend, f = rep(1:(length(data_break_ystart)/nobs), each = nobs))
data_break_yend_bandid_split <- split(data_break_yend_bandid, f = rep(1:(length(data_break_ystart)/nobs), each = nobs))
data_break_ystart <- unlist(Map(f = function(x, y) {
x <- x[order(y)]
},
data_break_ystart_split,
data_break_yend_bandid_split
))
data_break_yend <- unlist(Map(f = function(x, y) {
x <- x[order(y)]
},
data_break_yend_split,
data_break_yend_bandid_split
))
obs_ids_break_split <- lapply(data_break_ystart_split, FUN = function(x) obs_ids)
obs_ids_break <- unlist(Map(f = function(x, y) {
# order() is used here, potetial problem of dealing with ties, it seems to use a method like rank(ties.method = "first"), which we used before
# maybe we can use a rank based method to be consistent: order(rank(,ties.method = "first"))
x <- x[order(y)]
},
obs_ids_break_split,
data_break_yend_bandid_split
))
# Some preparation for later combination of all data, to insert correct ids
n_break <- length(data_break_ystart)
} else {
n_break <- 0
obs_ids_break <- NULL
}
# NEXT: to replace those values in data_boxwidth, because of the segment part, we have to make this detour now
# following calculation relies on the order we created data_final
# number of rows in fac2fac parts
n_fac2fac <- length(data_final_ystart_fac2fac)
data_boxwidth$y[(nrow(data_boxwidth) - n_fac2fac + 1):nrow(data_boxwidth)] <- data_final_ystart_fac2fac
data_boxwidth$yend[(nrow(data_boxwidth) - n_fac2fac + 1):nrow(data_boxwidth)] <- data_final_yend_fac2fac
} else {
# number of rows in fac2fac parts
n_fac2fac <- 0
obs_ids_fac2fac <- NULL
# for resort
n_break <- 0
obs_ids_break <- NULL
}
# To combine main part, resort, ordinary segments in boxes
if (!is.null(resort)) {
data_boxwidth <- data.frame(x = c(data_break_xstart, data_segment_xstart, data_boxwidth$x),
y = c(data_break_ystart, data_segment_ystart, data_boxwidth$y),
xend = c(data_break_xend, data_segment_xend, data_boxwidth$xend),
yend = c(data_break_yend, data_segment_yend, data_boxwidth$yend))
} else {
data_boxwidth <- data.frame(x = c(data_segment_xstart, data_boxwidth$x),
y = c(data_segment_ystart, data_boxwidth$y),
xend = c(data_segment_xend, data_boxwidth$xend),
yend = c(data_segment_yend, data_boxwidth$yend))
}
# data_boxwidth$id <- rep(obs_ids, times = nrow(data_boxwidth)/nobs)
# we have put fac2fac section to the very last of the data set, put break in the very beginning
data_boxwidth$id__ <- c(obs_ids_break, rep(obs_ids, times = (nrow(data_boxwidth) - n_fac2fac - n_break)/nobs), obs_ids_fac2fac)
datanames <- setdiff(names(data), c("name", "value", "level", "class", "value_text"))
# don't include the pcp specific variables - those are dealt with
data_boxwidth <- left_join(data_boxwidth, unique(data[,datanames]), by = "id__", suffix=c("",".2"))
# some of the reorder/arrange
if (overplot == "smallfirst") {
output_data <- data_boxwidth %>% group_by(group) %>% mutate(n = n()) %>% arrange(n)
}
if (overplot == "largefirst") {
output_data <- data_boxwidth %>% group_by(group) %>% mutate(n = n()) %>% arrange(desc(n))
}
if (overplot %in% c("original", "hierarchical")) {
output_data <- data_boxwidth
}
# reverse: to make the plot readable from left to right, especially for factor block design
# also adjusted the grids in scales.R and in the "prepare_width_ajustment" (should be adjustment though) part of this file
# also to improve the original output
output_data <- output_data %>% dplyr::filter(x != xend)
# no more needed
# if(reverse == TRUE){
# x_value <- levels(factor(output_data$x))
# xend_value <- levels(factor(output_data$xend))
#
# mirror_x <- factor(output_data$x)
# levels(mirror_x) <- rev(xend_value)
# mirror_x <- as.numeric(as.character(mirror_x))
#
# mirror_xend <- factor(output_data$xend)
# levels(mirror_xend) <- rev(x_value)
# mirror_xend <- as.numeric(as.character(mirror_xend))
#
# output_data$x <- mirror_x
# output_data$xend <- mirror_xend
#
# }
output_data
}
)
# used to identify the type of neighboring classes, return the position of the first one in a pair(like num-num, num-fac)
# for num to num, we use lines
# for num to factor, we use lines
# for factor to num, we use lines
# for factor to factor, we use ribbons
# 0622new: accomodate resort
classify <- function(classpcp, resort) {
classpcp <- as.numeric(classpcp %in% c("numeric", "integer"))
classpcp_diff <- diff(classpcp)
num2fac <- (1:(length(classpcp)-1))[classpcp_diff == -1]
fac2num <- (1:(length(classpcp)-1))[classpcp_diff == 1]
num2num <- (1:(length(classpcp)-1))[classpcp_diff == 0 & classpcp[-length(classpcp)] == TRUE]
fac2fac <- (1:(length(classpcp)-1))[classpcp_diff == 0 & classpcp[-length(classpcp)] == FALSE]
# resort come in, need to check resort are suitable
fac2fac <- sort(c(fac2fac, resort))
classification <- list(num2num = num2num,
num2fac = num2fac,
fac2num = fac2num,
fac2fac = fac2fac)
classification
}
# used to assign postion for the factors
# here is a long calculation formula
# defined another function inside this function, make sure they are correctly nested
assign_fac <- function(nlevels_list, nobs, freespace = 0.1) {
# adjustment offset for freespace
eachobs <- (1 - freespace)/nobs
level_range <- lapply(nlevels_list,
FUN = function(x) {
freespace_offset <- rep(freespace/(x$nlevels - 1), times = x$nlevels - 1)
freespace_offset_cum <- cumsum(freespace_offset)
c(0, rep(cumsum(eachobs*x$table), each = 2)[-2*x$nlevels]) +
c(0, 0, rep(freespace_offset_cum, each = 2))
})
# eachobs <- 1/nobs
# assign each level
# level_range <- lapply(nlevels_list,
# FUN = function(x) c(0, rep(cumsum(eachobs*x$table)[-x$nlevels], each = 2) +
# freespace/(x$nlevels-1)/2*rep(c(-1, 1), times = x$nlevels-1), 1))
# assign each obs
obs_position <- Map(f = function(x, y){
obs_position_each <- vector("list", length = length(x)/2)
for (i in 1:(length(x)/2)) {
obs_position_each[[i]] <- seq(from = x[2*i-1] + 0.5*eachobs,
to = x[2*i] - 0.5*eachobs,
length.out = y$table[i])
}
names(obs_position_each) <- names(y$table)
obs_position_each},
level_range, nlevels_list)
return(obs_position)
}
# used to arrange the factor positions properly to match the ystart, the same observation.
# start_position = classification$num2fac + 1, for num2fac. Just to make it a little more general.
# end_position = start_position + 1, for num2fac. For fac2num, it's different, be careful.
# start_position = classification$num2fac + 2, end_position = start_position - 1, for fac2num.
# start_position is the one we need to refer to, end_position is the one we actually adjust.
# make sure the levels of factors are used correctly here(match the data, match the order). How was it decided before?
arrange_fac_by_ystart <- function(data_spread, start_position, end_position, obs_position) {
# Map is usd here to deal with three lists in parallel
arranged_postion <- Map(f = function(y, x, z) {
# lapply uses like x[[i]] to extract sublist, assume Map is the same. And it works
# is it safe to use name here? it should work in my example, and it does work in my example
for (i in 1:length(z)) {
x <- replace(x,
list = which(y == names(z)[i]),
z[[i]][rank(x[y == names(z)[i]], ties.method="first")])
}
x
},
data_spread[ ,end_position, drop = FALSE],
data_spread[ ,start_position, drop = FALSE],
obs_position)
arranged_postion
}
# a function to assign box in each level in each factor
# fac_table is a summary of the exsiting combination of factors
# level_range_2 is calculated as level_range inside assign_fac to get postions of levels
# names_to_group is the names of those factors, for convenience
# 0622new: we don't actually need this
# assign_box <- function(fac_table, level_range_2, nlevels_list_con_fac, names_to_group) {
#
# box_position <- Map(f = function(x, y, z){
# box_proportion <- fac_table %>%
# group_by_(z) %>%
# mutate(proportion = Freq/sum(Freq)) %>%
# ungroup()
# box_position <- list()
# for(i in 1:(y$nlevels)){
# box_proportion_each <- box_proportion[box_proportion[z] == names(y$table)[i],]
# eachlevel <- x[c(2*i-1, 2*i)]
# eachbox <- eachlevel[1] + (eachlevel[2] - eachlevel[1])*cumsum(box_proportion_each$proportion)
# names(eachlevel) <- NULL
# box_position[[i]] <- c(eachlevel[1], eachbox)
# names(box_position)[i] <- names(y$table)[i]
# }
# box_position
# },
# level_range_2, nlevels_list_con_fac, names_to_group)
#
# box_position
# }
# calculate the bandid (all possible combinations of factors) for the data
# assign observations to different band
# continuous_fac is the position of factor block
bandid <- function(data_spread, continuous_fac, nobs, nlevels_list_con_fac) {
aa <- as.data.frame(lapply(data_spread[,continuous_fac + 1, drop = FALSE],
FUN = function(x) as.numeric(x) - 1))
dd <- vector()
for (i in 1:length(continuous_fac)) {
dd[i] <- nlevels_list_con_fac[[i]][[1]]
}
dd
as.matrix(aa)%*%c(1, cumprod(dd)[-length(continuous_fac)]) + 1
}
# used to arrange observation positions within each bandid by ystart
# the previous arrange_fac_by_ystart arrange observation positions with in each level
# to properly adjust the lines to avoid overlap, to match the positions in a factor to the numeric value
# within each level, arrange the positions according to bandid from small to big, which is consistent with box_position
# notice that, to accomadate no numeric variable case, I changed the stat_position to a data vector
# since this function is only used to one start column and one end column(by lapply and Map to generalize)
# start_position indicates the numeric variable in data_spread, to be adjusted by
# end_position indicates the first factor variable in a factor block, or the last one when adjust backward
# obs_position is the return value of assign_fac(nlevels_list_con_fac, nobs)
# names_to_group for convience
# can potentially works for each factor block, but used for one variable each time in process_fac2fac
# Added bandid for observations as part of output
arrange_fac_by_ystart_bandid <- function(data_spread, start_position, end_position,
obs_position, fac_table, names_to_group) {
# actually, we may not need Map, unless when we deal with more than one factor block
# but we do deal with it
arranged_position <- Map(f = function(y, x, z, l) {
x_bandid <- x
# i for each level
for (i in 1:length(z)) {
aa <- fac_table[fac_table[l]==names(z[i]), ]
aa$position_in_level <- cumsum(aa$Freq)
bb <- aa$position_in_level
# j for each box(smallest band)
for (j in 1:length(bb)) {
position_in_box <- z[[i]][(bb[j]-aa$Freq[j] + 1):bb[j]]
x <- replace(x,
list = which(data_spread$bandid == aa$bandid[j]),
values = position_in_box[rank(x[data_spread$bandid == aa$bandid[j]], ties.method="first")]
)
# 0622new: A new added part to label the bandid in the process
x_bandid <- replace(x_bandid,
list = which(data_spread$bandid == aa$bandid[j]),
values = aa$bandid[j]
)
}
}
output <- list(x = x, x_bandid = x_bandid)
output
},
# notice that, to accomadate no numeric variable case, I changed the stat_position to a data vector
data_spread[, end_position, drop = FALSE],
as.data.frame(start_position),
obs_position,
names_to_group)
arranged_position
}
# summarize the steps for one factor block into a function
# input: continuous_fac, data_spread and bywhich, freespace, nobs