new figures for BinSeg

tdhock · Sep 2, 2020 · fc54eec · fc54eec
1 parent ec4c1de
commit fc54eec
Show file tree

Hide file tree

Showing 15 changed files with 1,695 additions and 1,009 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,7 @@ Rplots.pdf
 auto
 *.log
 figure-label-errors-data
+figure-label-errors-data-binseg
 figure-sequence-cv-data
 *~
 library

diff --git a/README.org b/README.org
@@ -5,7 +5,7 @@ Figures for Labeled Optimal PARTitioning paper, [[https://arxiv.org/abs/2006.139
 - Figure 2: timings. [[file:figure-timings.R][R script]], tex for inclusion in paper: [[file:figure-timings-labels.tex][time vs
   number of labels (left)]], [[file:figure-timings.tex][time vs number of data (right)]].
 - Figure 3: Best case label error. [[file:figure-label-errors.R][R script]], [[file:figure-label-errors.pdf][OPART pdf (left)]],
-  [[file:figure-label-errors-SegAnnot.pdf][SegAnnot pdf (right)]].
+  [[file:figure-label-errors-SegAnnot.pdf][SegAnnot pdf (right)]], [[file:figure-label-errors-BinSeg.pdf][BinSeg pdf]].
 - Comparison with BIC figures. [[file:figure-cv-BIC.R][R script]]:
   - Figure 4: [[file:figure-cv-BIC.pdf][Error rates]].
   - Figure 5: [[file:figure-cv-BIC-roc.pdf][ROC curves]].

diff --git a/figure-cv-BIC-roc.pdf b/figure-cv-BIC-roc.pdf
diff --git a/figure-cv-BIC.R b/figure-cv-BIC.R
@@ -1,11 +1,19 @@
 source("packages.R")
 
+common.names <- c(
+  "test.fold", "penalty", "set", "sequenceID", "count", "cache.csv", 
+  "model.name", "penalty", "possible.fp", "fp", "possible.fn", 
+  "fn", "labels", "errors")
 err.dt <- data.table(
-  csv=Sys.glob("figure-label-errors-data/*.csv")
-)[, data.table::fread(
-  csv,
-  colClasses=list(character=5)
-), by=csv]
+  csv=Sys.glob("figure-label-errors-data*/*.csv")
+)[, {
+  name.vec <- names(data.table::fread(csv, nrow=0))
+  seq.i <- which(name.vec=="sequenceID")
+  data.table::fread(
+    csv,
+    colClasses=list(character=seq.i)
+  )[, common.names, with=FALSE]
+}, by=csv]
 err.dt[model.name=="LOPART" & set=="train", table(errors)]
 err.dt[model.name=="LOPART" & set=="train" & 0<errors, .(
   csv, test.fold, set, penalty, fp, fn)]
@@ -99,14 +107,16 @@ algo.colors <- c(
   OPART="#0077CC",
   LOPART="black")
 algo.colors <- c(
+  SegAnnot="blue",
+  BinSeg="orange",
   OPART="deepskyblue",
   LOPART="black",
-  SegAnnot="blue",
   FPOP="red")
 gg <- ggplot()+
   theme_bw()+
   scale_color_manual(values=algo.colors)+
   scale_size_manual(values=c(
+    BinSeg=1.25,
     LOPART=1.5,
     OPART=1))+
   directlabels::geom_dl(aes(
@@ -127,6 +137,7 @@ gg <- ggplot()+
     color=model.name,
     size=model.name,
     group=paste(model.name, test.fold)),
+    alpha=0.7,
     data=roc.dt)+
   geom_point(aes(
     FPR, TPR,

diff --git a/figure-cv-BIC.pdf b/figure-cv-BIC.pdf
diff --git a/figure-label-errors-BinSeg.pdf b/figure-label-errors-BinSeg.pdf
diff --git a/figure-label-errors-data-binseg.R b/figure-label-errors-data-binseg.R
@@ -0,0 +1,70 @@
+source("packages.R")  # project setup script; its contents are not visible here
+# Caches BinSeg label-error tables: one CSV per sequence under figure-label-errors-data-binseg/.
+labeled.data <- list()  # data.type -> table read from the matching csv.gz
+sizes.list <- list()  # data.type -> row count per sequenceID
+for(data.type in c("labels", "signals")){
+  csv.gz <- sprintf("data-for-LOPART-%s.csv.gz", data.type)
+  type.dt <- data.table::fread(csv.gz)
+  labeled.data[[data.type]] <- type.dt
+  sizes.list[[data.type]] <- type.dt[, data.table(
+    data.type,
+    count=.N  # number of rows for this sequenceID
+  ), by=sequenceID][order(count)]
+}
+sizes <- do.call(rbind, sizes.list)
+sizes[, .(range=range(count)), by=data.type]  # min and max count per data.type
+# Only sequences whose cache CSV does not exist yet are processed below.
+seq.dt <- sizes[data.type=="signals", .(sequenceID, count)]
+seq.dt[, cache.csv := file.path(
+  "figure-label-errors-data-binseg", paste0(sequenceID, ".csv"))]
+seq.todo <- seq.dt[!file.exists(cache.csv)]
+for(sequenceID.i in seq_along(seq.todo$sequenceID)){
+  row.todo <- seq.todo[sequenceID.i]  # one-row table: sequenceID, count, cache.csv
+  cat(sprintf("%4d / %4d %s\n", sequenceID.i, nrow(seq.todo), row.todo$sequenceID))  # progress line
+  data.list <- list()  # data.type -> rows belonging to this sequence
+  for(data.type in names(labeled.data)){
+    data.list[[data.type]] <- labeled.data[[data.type]][row.todo, on="sequenceID"]  # join: rows matching row.todo's sequenceID
+  }
+  computed.err <- data.table(test.fold=unique(data.list$labels$fold))[, {  # one group per distinct fold value in the labels
+    fold.regions <- data.table(data.list$labels)
+    fold.regions[, set := ifelse(fold==test.fold, "test", "train")]  # labels in the current fold are "test", all others "train"
+    fold.regions[, annotation := ifelse(
+      changes==0, "0breakpoints", "1breakpoint")]
+    train.label.dt <- fold.regions[set=="train"]  # NOTE(review): not referenced again in this script
+    fit.dt <- binsegRcpp::binseg_normal(
+      data.list[["signals"]][["logratio"]])  # NOTE(review): input does not depend on test.fold, yet the fit runs once per fold
+    data.table(penalty=10^seq(-5, 5, by=0.5))[, {  # 21 penalties from 1e-5 to 1e5
+      end <- fit.dt[, end[1:which.min(loss+segments*penalty)] ]  # first k ends, k = row minimizing loss + segments*penalty
+      meta.dt <- data.table(row.todo, model.name="BinSeg", penalty)
+      change.dt <- data.table(change=end[-1]+0.5)  # drops the first end; presumably that is the end of the data -- TODO confirm
+      fold.regions[, {  # errors are computed separately per set value (see by=set below)
+        err.list <- penaltyLearning::labelError(
+          models=meta.dt,
+          labels=data.table(meta.dt, .SD),
+          changes=data.table(meta.dt, change.dt),
+          change.var = "change",
+          label.vars = c("start", "end"),
+          problem.vars = "sequenceID",
+          model.vars = c("model.name", "penalty"))
+        err.list[["model.errors"]]  # value of this group; only the model.errors element is kept
+      }, by=set]
+    }, by=penalty]
+  }, by=test.fold]
+  if(FALSE){  # disabled block: never executes
+    ggplot()+
+      geom_line(aes(
+        penalty, errors, color=model.name),
+        data = computed.err[, .(penalty, errors, set, test.fold, model.name)])+
+      theme_bw()+
+      theme(panel.spacing=grid::unit(0, "lines"))+
+      facet_grid(set + test.fold ~ .)+
+      scale_x_log10()
+  }
+  computed.err[set=="test", .(  # NOTE(review): result is not assigned, and for-loop bodies do not auto-print
+    min=min(errors),
+    penalties=.N
+  ), by=.(test.fold, model.name)]
+  dir.create(dirname(row.todo$cache.csv), showWarnings = FALSE, recursive = TRUE)
+  data.table::fwrite(computed.err, row.todo$cache.csv)  # once written, the file.exists() filter above skips this sequence
+}
+
diff --git a/figure-label-errors.R b/figure-label-errors.R
@@ -7,13 +7,22 @@ algo.colors <- c(
 algo.colors <- c(
   OPART="deepskyblue",
   LOPART="black",
+  BinSeg="orange",
   SegAnnot="blue")
+common.names <- c(
+  "test.fold", "penalty", "set", "sequenceID", "count", "cache.csv", 
+  "model.name", "penalty", "possible.fp", "fp", "possible.fn", 
+  "fn", "labels", "errors")
 err.dt <- data.table(
-  csv=Sys.glob("figure-label-errors-data/*.csv")
-)[, data.table::fread(
-  csv,
-  colClasses=list(character=5)
-), by=csv]
+  csv=Sys.glob("figure-label-errors-data*/*.csv")
+)[, {
+  name.vec <- names(data.table::fread(csv, nrow=0))
+  seq.i <- which(name.vec=="sequenceID")
+  data.table::fread(
+    csv,
+    colClasses=list(character=seq.i)
+  )[, common.names, with=FALSE]
+}, by=csv]
 err.dt[model.name=="LOPART" & set=="train", table(errors)]
 err.dt[model.name=="LOPART" & set=="train" & 0<errors, .(
   csv, test.fold, set, penalty, fp, fn)]
@@ -118,12 +127,40 @@ mytab <- function(dt, col.name){
 }
 mytab(total.min.wide, "train_OPART")
 
-total.min.wide[, test.diff := test_OPART-test_LOPART]
-mytab(total.min.wide, "test.diff")
+total.min.wide[, test.diff_BinSeg := test_BinSeg-test_LOPART]
+train.test.BinSeg <- total.min.wide[, .(
+  splits=.N
+), by=.(train_BinSeg, test.diff_BinSeg)]
+gg <- ggplot()+
+  ggtitle("Best case comparison
+with BinSeg")+
+  geom_hline(yintercept=0, color="grey")+
+  geom_vline(xintercept=0, color="grey")+
+  geom_tile(aes(
+    train_BinSeg, test.diff_BinSeg, fill=log10(splits)),
+    alpha=0.8,
+    data=train.test.BinSeg)+
+  geom_text(aes(
+    train_BinSeg, test.diff_BinSeg, label=splits),
+    data=train.test.BinSeg)+
+  scale.for("BinSeg")+
+  coord_equal()+
+  theme_bw()+
+  scale_x_continuous(
+    "BinSeg train label errors
+(LOPART is always=0)")+
+  scale_y_continuous(
+    "Test label error difference
+(BinSeg-LOPART)")
+pdf("figure-label-errors-BinSeg.pdf", width=3, height=2.3)
+print(gg)
+dev.off()
 
+total.min.wide[, test.diff_OPART := test_OPART-test_LOPART]
+mytab(total.min.wide, "test.diff_OPART")
 train.test.counts <- total.min.wide[, .(
   splits=.N
-), by=.(train_OPART, test.diff)]
+), by=.(train_OPART, test.diff_OPART)]
 gg <- ggplot()+
   ##ggtitle("LOPART is more accurate\nthan OPART")+
   ##my.title+
@@ -132,11 +169,11 @@ with OPART")+
   geom_hline(yintercept=0, color="grey")+
   geom_vline(xintercept=0, color="grey")+
   geom_tile(aes(
-    train_OPART, test.diff, fill=log10(splits)),
+    train_OPART, test.diff_OPART, fill=log10(splits)),
     alpha=0.8,
     data=train.test.counts)+
   geom_text(aes(
-    train_OPART, test.diff, label=splits),
+    train_OPART, test.diff_OPART, label=splits),
     data=train.test.counts)+
   ##scale.fill+
   scale.for("OPART")+