Skip to content

Commit

Permalink
new figures for BinSeg
Browse files Browse the repository at this point in the history
  • Loading branch information
tdhock committed Sep 2, 2020
1 parent ec4c1de commit fc54eec
Show file tree
Hide file tree
Showing 15 changed files with 1,695 additions and 1,009 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Rplots.pdf
auto
*.log
figure-label-errors-data
figure-label-errors-data-binseg
figure-sequence-cv-data
*~
library
Expand Down
2 changes: 1 addition & 1 deletion README.org
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Figures for Labeled Optimal PARTitioning paper, [[https://arxiv.org/abs/2006.139
- Figure 2: timings. [[file:figure-timings.R][R script]], tex for inclusion in paper: [[file:figure-timings-labels.tex][time vs
number of labels (left)]], [[file:figure-timings.tex][time vs number of data (right)]].
- Figure 3: Best case label error. [[file:figure-label-errors.R][R script]], [[file:figure-label-errors.pdf][OPART pdf (left)]],
[[file:figure-label-errors-SegAnnot.pdf][SegAnnot pdf (right)]].
[[file:figure-label-errors-SegAnnot.pdf][SegAnnot pdf (right)]], [[file:figure-label-errors-BinSeg.pdf][BinSeg pdf]].
- Comparison with BIC figures. [[file:figure-cv-BIC.R][R script]]:
- Figure 4: [[file:figure-cv-BIC.pdf][Error rates]].
- Figure 5: [[file:figure-cv-BIC-roc.pdf][ROC curves]].
Expand Down
Binary file modified figure-cv-BIC-roc.pdf
Binary file not shown.
23 changes: 17 additions & 6 deletions figure-cv-BIC.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
source("packages.R")

common.names <- c(
"test.fold", "penalty", "set", "sequenceID", "count", "cache.csv",
"model.name", "penalty", "possible.fp", "fp", "possible.fn",
"fn", "labels", "errors")
err.dt <- data.table(
csv=Sys.glob("figure-label-errors-data/*.csv")
)[, data.table::fread(
csv,
colClasses=list(character=5)
), by=csv]
csv=Sys.glob("figure-label-errors-data*/*.csv")
)[, {
name.vec <- names(data.table::fread(csv, nrow=0))
seq.i <- which(name.vec=="sequenceID")
data.table::fread(
csv,
colClasses=list(character=seq.i)
)[, common.names, with=FALSE]
}, by=csv]
err.dt[model.name=="LOPART" & set=="train", table(errors)]
err.dt[model.name=="LOPART" & set=="train" & 0<errors, .(
csv, test.fold, set, penalty, fp, fn)]
Expand Down Expand Up @@ -99,14 +107,16 @@ algo.colors <- c(
OPART="#0077CC",
LOPART="black")
algo.colors <- c(
SegAnnot="blue",
BinSeg="orange",
OPART="deepskyblue",
LOPART="black",
SegAnnot="blue",
FPOP="red")
gg <- ggplot()+
theme_bw()+
scale_color_manual(values=algo.colors)+
scale_size_manual(values=c(
BinSeg=1.25,
LOPART=1.5,
OPART=1))+
directlabels::geom_dl(aes(
Expand All @@ -127,6 +137,7 @@ gg <- ggplot()+
color=model.name,
size=model.name,
group=paste(model.name, test.fold)),
alpha=0.7,
data=roc.dt)+
geom_point(aes(
FPR, TPR,
Expand Down
Binary file modified figure-cv-BIC.pdf
Binary file not shown.
Binary file added figure-label-errors-BinSeg.pdf
Binary file not shown.
70 changes: 70 additions & 0 deletions figure-label-errors-data-binseg.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
source("packages.R")

## Read the two input tables (labels and signals), keeping them in a
## list named by data type.
data.type.vec <- c("labels", "signals")
labeled.data <- lapply(
  setNames(data.type.vec, data.type.vec),
  function(data.type){
    data.table::fread(paste0("data-for-LOPART-", data.type, ".csv.gz"))
  })
## For each data type, count the rows of every sequenceID, sorted by
## increasing count.
sizes.list <- lapply(
  setNames(data.type.vec, data.type.vec),
  function(data.type){
    count.dt <- labeled.data[[data.type]][, list(
      data.type=data.type,
      count=.N
    ), by=sequenceID]
    count.dt[order(count)]
  })
sizes <- do.call(rbind, sizes.list)
## Smallest and largest number of rows per sequence, for each data type.
sizes[, list(range=range(count)), by=data.type]

## One row per signal sequence, with its number of data points.
seq.dt <- sizes[data.type == "signals", list(sequenceID, count)]
## Path of the csv file in which the results for each sequence are cached.
data.table::set(
  seq.dt,
  j="cache.csv",
  value=file.path(
    "figure-label-errors-data-binseg",
    paste0(seq.dt[["sequenceID"]], ".csv")))
## Only the sequences without a cache file still need to be computed.
seq.todo <- seq.dt[!file.exists(seq.dt[["cache.csv"]])]
## For every sequence which has no cache file yet: fit binary
## segmentation to its signal, compute label errors for a grid of
## penalties in each test fold / set, and save the result to the
## sequence's cache csv file.
for(sequenceID.i in seq_along(seq.todo$sequenceID)){
  row.todo <- seq.todo[sequenceID.i]
  ## Progress message: index, total, and sequenceID.
  cat(sprintf("%4d / %4d %s\n", sequenceID.i, nrow(seq.todo), row.todo$sequenceID))
  ## Rows of each input table (labels, signals) for this sequence.
  data.list <- list()
  for(data.type in names(labeled.data)){
    data.list[[data.type]] <- labeled.data[[data.type]][row.todo, on="sequenceID"]
  }
  ## The binary segmentation fit depends only on the signal of this
  ## sequence, not on the test fold, so it is computed once here rather
  ## than once per fold.
  fit.dt <- binsegRcpp::binseg_normal(
    data.list[["signals"]][["logratio"]])
  ## One group per fold: that fold is the test set, the other labels are
  ## the train set.
  computed.err <- data.table(test.fold=unique(data.list$labels$fold))[, {
    fold.regions <- data.table(data.list$labels)
    fold.regions[, set := ifelse(fold==test.fold, "test", "train")]
    ## annotation column built from the changes column, for labelError.
    fold.regions[, annotation := ifelse(
      changes==0, "0breakpoints", "1breakpoint")]
    ## Grid of penalties, evenly spaced on the log10 scale.
    data.table(penalty=10^seq(-5, 5, by=0.5))[, {
      ## Keep the segment ends of the model which minimizes the
      ## penalized loss (loss + segments*penalty).
      end <- fit.dt[, end[1:which.min(loss+segments*penalty)] ]
      meta.dt <- data.table(row.todo, model.name="BinSeg", penalty)
      ## NOTE(review): presumably the first end is the last data index
      ## (model with one segment), so the remaining ends are the
      ## changepoints -- confirm against the binsegRcpp documentation.
      change.dt <- data.table(change=end[-1]+0.5)
      ## Label errors are computed separately for the train and test
      ## labels.
      fold.regions[, {
        err.list <- penaltyLearning::labelError(
          models=meta.dt,
          labels=data.table(meta.dt, .SD),
          changes=data.table(meta.dt, change.dt),
          change.var = "change",
          label.vars = c("start", "end"),
          problem.vars = "sequenceID",
          model.vars = c("model.name", "penalty"))
        err.list[["model.errors"]]
      }, by=set]
    }, by=penalty]
  }, by=test.fold]
  ## Disabled diagnostic plot: errors as a function of penalty, one
  ## panel per set and test fold.
  if(FALSE){
    ggplot()+
      geom_line(aes(
        penalty, errors, color=model.name),
        data = computed.err[, .(penalty, errors, set, test.fold, model.name)])+
      theme_bw()+
      theme(panel.spacing=grid::unit(0, "lines"))+
      facet_grid(set + test.fold ~ .)+
      scale_x_log10()
  }
  ## Summary of the test errors per fold. Inside a for loop this value
  ## is not auto-printed; it is only shown when this statement is run
  ## by itself at top level.
  computed.err[set=="test", .(
    min=min(errors),
    penalties=.N
  ), by=.(test.fold, model.name)]
  dir.create(dirname(row.todo$cache.csv), showWarnings = FALSE, recursive = TRUE)
  data.table::fwrite(computed.err, row.todo$cache.csv)
}

57 changes: 47 additions & 10 deletions figure-label-errors.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,22 @@ algo.colors <- c(
algo.colors <- c(
OPART="deepskyblue",
LOPART="black",
BinSeg="orange",
SegAnnot="blue")
common.names <- c(
"test.fold", "penalty", "set", "sequenceID", "count", "cache.csv",
"model.name", "penalty", "possible.fp", "fp", "possible.fn",
"fn", "labels", "errors")
err.dt <- data.table(
csv=Sys.glob("figure-label-errors-data/*.csv")
)[, data.table::fread(
csv,
colClasses=list(character=5)
), by=csv]
csv=Sys.glob("figure-label-errors-data*/*.csv")
)[, {
name.vec <- names(data.table::fread(csv, nrow=0))
seq.i <- which(name.vec=="sequenceID")
data.table::fread(
csv,
colClasses=list(character=seq.i)
)[, common.names, with=FALSE]
}, by=csv]
err.dt[model.name=="LOPART" & set=="train", table(errors)]
err.dt[model.name=="LOPART" & set=="train" & 0<errors, .(
csv, test.fold, set, penalty, fp, fn)]
Expand Down Expand Up @@ -118,12 +127,40 @@ mytab <- function(dt, col.name){
}
mytab(total.min.wide, "train_OPART")

total.min.wide[, test.diff := test_OPART-test_LOPART]
mytab(total.min.wide, "test.diff")
total.min.wide[, test.diff_BinSeg := test_BinSeg-test_LOPART]
train.test.BinSeg <- total.min.wide[, .(
splits=.N
), by=.(train_BinSeg, test.diff_BinSeg)]
gg <- ggplot()+
ggtitle("Best case comparison
with BinSeg")+
geom_hline(yintercept=0, color="grey")+
geom_vline(xintercept=0, color="grey")+
geom_tile(aes(
train_BinSeg, test.diff_BinSeg, fill=log10(splits)),
alpha=0.8,
data=train.test.BinSeg)+
geom_text(aes(
train_BinSeg, test.diff_BinSeg, label=splits),
data=train.test.BinSeg)+
scale.for("BinSeg")+
coord_equal()+
theme_bw()+
scale_x_continuous(
"BinSeg train label errors
(LOPART is always=0)")+
scale_y_continuous(
"Test label error difference
(BinSeg-LOPART)")
pdf("figure-label-errors-BinSeg.pdf", width=3, height=2.3)
print(gg)
dev.off()

total.min.wide[, test.diff_OPART := test_OPART-test_LOPART]
mytab(total.min.wide, "test.diff_OPART")
train.test.counts <- total.min.wide[, .(
splits=.N
), by=.(train_OPART, test.diff)]
), by=.(train_OPART, test.diff_OPART)]
gg <- ggplot()+
##ggtitle("LOPART is more accurate\nthan OPART")+
##my.title+
Expand All @@ -132,11 +169,11 @@ with OPART")+
geom_hline(yintercept=0, color="grey")+
geom_vline(xintercept=0, color="grey")+
geom_tile(aes(
train_OPART, test.diff, fill=log10(splits)),
train_OPART, test.diff_OPART, fill=log10(splits)),
alpha=0.8,
data=train.test.counts)+
geom_text(aes(
train_OPART, test.diff, label=splits),
train_OPART, test.diff_OPART, label=splits),
data=train.test.counts)+
##scale.fill+
scale.for("OPART")+
Expand Down
Loading

0 comments on commit fc54eec

Please sign in to comment.