-
Notifications
You must be signed in to change notification settings - Fork 3
/
figure-linear-model.R
130 lines (117 loc) · 3.69 KB
/
figure-linear-model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
library(data.table)
library(ggplot2)
data.dir <- "../neuroblastoma-data/data/ATAC_JV_adipose"
data.list <- list()
for(f in c("inputs", "outputs", "evaluation")){
f.csv.xz <- file.path(data.dir, paste0(f, ".csv.xz"))
if(file.exists(f.csv.xz)){
system(paste("unxz", f.csv.xz))
}
f.csv <- file.path(data.dir, paste0(f, ".csv"))
f.dt <- data.table::fread(f.csv)
data.list[[f]] <- f.dt
}
folds.csv <- Sys.glob(file.path(data.dir, "cv", "*", "folds.csv"))[1]
folds.dt <- data.table::fread(folds.csv)
validation.fold <- 1
validation.ids <- folds.dt[fold==validation.fold, sequenceID]
X.all <- scale(data.list$inputs[, -1])
rownames(X.all) <- data.list$inputs$sequenceID
X.finite <- X.all[, apply(is.finite(X.all), 2, all)]
set.list <- list(
validation=rownames(X.finite) %in% validation.ids)
set.list$train <- !set.list$validation
X.list <- lapply(set.list, function(i)X.finite[i, ])
## initialization to origin.
weight.vec <- rep(0, ncol(X.finite)+1)
y.train <- data.list[["outputs"]][
!sequenceID %in% validation.ids,
cbind(min.log.lambda, max.log.lambda)]
set.seed(1)
fit <- penaltyLearning::IntervalRegressionCV(
X.list[["train"]],
y.train)
## weight
pmat <- fit[["param.mat"]]
weight.vec <- pmat[-1]
intercept <- pmat[1]
computeAUM <- function(w, i, is.set){
pred.pen.vec <- (X.finite %*% w) + i
pred.dt <- data.table(
sequenceID=rownames(pred.pen.vec),
pred.log.lambda=as.numeric(pred.pen.vec))
set.dt <- pred.dt[is.set]
penaltyLearning::ROChange(
data.list$evaluation, set.dt, "sequenceID")
}
iteration.dt.list <- list()
for(iteration in 1:1000){
if(! iteration %in% names(iteration.dt.list)){
summary.dt.list <- list()
set.roc.list <- list()
for(set in names(set.list)){
set.roc.list[[set]] <- computeAUM(weight.vec, intercept, set.list[[set]])
summary.dt.list[[set]] <- with(set.roc.list[[set]], data.table(
set,
thresholds,
auc,
aum))
}
summary.dt <- do.call(rbind, summary.dt.list)
iteration.dt.list[[paste(iteration)]] <- data.table(
iteration, summary.dt)
print(iteration)
g.dt <- set.roc.list[["train"]][["aum.grad"]]
not.same <- g.dt[lo != hi]
if(0 < nrow(not.same)){
print(not.same)
stop("not equal")
}
g.vec <- g.dt$lo
direction.vec <- -t(X.list[["train"]]) %*% g.vec
take.step <- function(s){
weight.vec + s*direction.vec
}
set.aum.list <- list()
for(step.size in 10^seq(-10, 0, by=0.5)){
new.weight.vec <- take.step(step.size)
for(set in "train"){
set.roc <- computeAUM(new.weight.vec, 0, set.list[[set]])
set.aum.list[[paste(step.size, set)]] <- data.table(
step.size, set, aum=set.roc$aum,
intercept=set.roc$thresholds[
threshold=="min.error", (max.thresh+min.thresh)/2])
}
}
set.aum <- do.call(rbind, set.aum.list)
best.dt <- set.aum[, .SD[min(aum)==aum], by=set]
ggplot()+
geom_line(aes(
step.size, aum),
data=set.aum)+
geom_point(aes(
step.size, aum),
data=best.dt)+
geom_text(aes(
step.size, aum, label=aum),
vjust=0,
data=best.dt)+
scale_x_log10()+
scale_y_log10()+
facet_grid(set ~ ., scales="free")
weight.vec <- take.step(best.dt[["step.size"]])
intercept <- best.dt[["intercept"]]
}
}
iteration.dt <- do.call(rbind, iteration.dt.list)
iteration.dt[set=="train", .(iteration, threshold, aum, errors)]
ggplot()+
geom_line(aes(
iteration, aum),
data=iteration.dt[threshold=="predicted"])+
facet_grid(set ~ ., scales="free")
ggplot()+
geom_line(aes(
iteration, errors),
data=iteration.dt[threshold=="predicted"])+
facet_grid(set ~ ., scales="free")