diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3.R
index ae6be3a..974bce2 100644
--- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3.R
+++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3.R
@@ -1,14 +1,10 @@
 code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
 offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
-
-# Pre-process data
 source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R"))
-packs <- c("bigstatsr", "future", "furrr", "Matrix", "rhdf5", "stringr", "openxlsx", "katsevich2020", "magrittr")
-for (pack in packs) suppressPackageStartupMessages(library(pack, character.only = TRUE))
+library(ondisc)
+library(rhdf5)
+library(openxlsx)
 
-###################
-# Expression matrix
-###################
 # First, we determine the number of cells and number of genes across all the batches
 h5_files <- paste0(raw_data_dir, "/", grep(pattern = '*.h5', list.files(raw_data_dir), value = TRUE))
 dims_across_h5s <- sapply(h5_files, function(h5_file) {
@@ -22,71 +18,27 @@ row.names(dims_across_h5s) <- NULL
 all(dims_across_h5s[,1] == dims_across_h5s[1,1]) # Verify n genes consistent across files
 n_cells_total <- sum(dims_across_h5s[,2])
 n_genes_total <- dims_across_h5s[1,1]
+batch_lane <- stringr::str_extract(string = h5_files, pattern = "Batch-[0-9]_[0-9]") %>%
+  gsub(pattern = "Batch-", replacement = "")
+
+# Next, initialize an ondisc matrix storing the gene expression data
+if (!dir.exists(paste0(processed_dir, "/odm"))) dir.create(paste0(processed_dir, "/odm"))
+h5_odm <- create_ondisc_matrix_from_h5_list(h5_list = h5_files,
+                                            odm_fp = paste0(processed_dir, "/odm/gene_expression_matrix"),
+                                            metadata_fp = paste0(processed_dir, "/odm/gene_expression_metadata.rds"),
+                                            barcode_suffixes = batch_lane,
+                                            progress = TRUE)
 
-# We will use only the protein-coding genes in this analysis; load them.
+# modify the ondisc matrix by identifying the protein-coding genes
 gene_df <- read.xlsx(xlsxFile = paste0(raw_data_dir, "/Genes.xlsx"), sheet = 1)
 all_protein_coding_genes <- gene_df$Gene_Symbol
 rm(gene_df)
-h5_file <- h5_files[1]
-h5_handle <- H5Fopen(h5_file)
-all_sequenced_genes <- h5_handle$"/refgenome_hg38_CROP-Guide-MS2-2.1.0/gene_names"
-all_sequenced_genes_ids <-  h5_handle$"/refgenome_hg38_CROP-Guide-MS2-2.1.0/genes"
-saveRDS(all_sequenced_genes_ids, file = paste0(processed_dir, '/all_sequenced_genes_id.rds'))
-
-h5_gene_info <- data.frame(row_idx = seq(1, n_genes_total), gene_name = all_sequenced_genes, gene_id = all_sequenced_genes_ids)
-h5_gene_info_protein_coding <- dplyr::filter(h5_gene_info,
-                                             gene_name %in% all_protein_coding_genes)
-genes_in_use <- unique(h5_gene_info_protein_coding$gene_name)
-genes_in_use_ids <- h5_gene_info_protein_coding$gene_id
-gene_idxs_in_use <- h5_gene_info_protein_coding$row_idx
-n_genes_in_use <- length(genes_in_use_ids)
-H5Fclose(h5_handle)
-
-# Next, we create a file-backed matrix to store the transpose of the expression matrix
-exp_mat_t <- FBM(nrow = n_genes_in_use, ncol = n_cells_total, type = "unsigned short", init = 0, backingfile = paste0(processed_dir, "/expression_matrix_t"), create_bk = TRUE)
-exp_mat_t_metadata <- list(nrow = n_genes_in_use, ncol = n_cells_total, type = "unsigned short", backingfile = paste0(processed_dir, "/expression_matrix_t"))
-saveRDS(object = exp_mat_t_metadata, file = paste0(processed_dir, "/exp_mat_t_metadata.rds"))
-
-exp_mat <- FBM(nrow = n_cells_total, ncol = n_genes_in_use, type = "unsigned short", init = 0, backingfile = paste0(processed_dir, "/expression_matrix"), create_bk = TRUE)
-exp_mat_metadata <- list(nrow = n_cells_total, ncol = n_genes_in_use, type = "unsigned short", backingfile = paste0(processed_dir, "/expression_matrix"))
-saveRDS(object = exp_mat_metadata, file = paste0(processed_dir, "/exp_mat_metadata.rds"))
-
-# We iterate through the hd5 files and write the column chunks piece-by-piece to the FBM.
-n_cells_processed <- 0
-ordered_cell_barcodes <- character(0)
-
-batch_lane <- stringr::str_extract(string = h5_files, pattern = "Batch-[0-9]_[0-9]") %>%
-  gsub(pattern = "Batch-", replacement = "")
-for (i in seq(1, length(h5_files))) {
-  print(i)
-  curr_batch_lane <- batch_lane[i]
-  h5_file <- h5_files[i]
-  print(paste("Working on", h5_file))
-  h5_handle <- H5Fopen(h5_file)
-  dat <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/data"
-  indices <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/indices"
-  ind_ptr <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/indptr"
-  shape <-  h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/shape"
-  n_cols <- shape[2]
-  cell_barcodes <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/barcodes"
-  # do some barcode processing
-  if (any(duplicated(cell_barcodes))) stop("Duplicate barcodes")
-  cell_barcodes <- paste0(cell_barcodes, "_", curr_batch_lane)
-  # Ensure that the maximum index pointer is less than the maximum integer value in R
-  if (max(ind_ptr) >= 2147483647) stop("Index pointer exceeds R max integer value.")
-  sparseM <- Matrix::sparseMatrix(i = indices, p = ind_ptr, x = dat, dims = shape)
-  sparseM <- sparseM[gene_idxs_in_use,]
-  # write
-  exp_mat_t[seq(1, n_genes_in_use), (n_cells_processed + 1):(n_cells_processed + n_cols)] <- as.matrix(sparseM)
-  exp_mat[(n_cells_processed + 1):(n_cells_processed + n_cols), seq(1, n_genes_in_use)] <- as.matrix(t(sparseM))
-  H5Fclose(h5_handle)
-  n_cells_processed <- n_cells_processed + n_cols
-  ordered_cell_barcodes <- c(ordered_cell_barcodes, cell_barcodes)
-}
-
-saveRDS(object = ordered_cell_barcodes, file = paste0(processed_dir, "/cell_barcodes_gene.rds"))
-saveRDS(object = genes_in_use, file = paste0(processed_dir, "/ordered_genes.RDS"))
-saveRDS(object = genes_in_use_ids, file = paste0(processed_dir, "/ordered_gene_ids.RDS"))
+is_protein_coding <- get_feature_names(h5_odm) %in% all_protein_coding_genes
+h5_odm_w_protein_coding <- h5_odm %>% mutate_feature_covariates(protein_coding = is_protein_coding)
+save_odm(h5_odm_w_protein_coding, paste0(processed_dir, "/odm/metadata_plus.rds"))
+all_gene_ids <- rhdf5::h5read(file = h5_files[1], name = "/refgenome_hg38_CROP-Guide-MS2-2.1.0/genes")
+saveRDS(object = all_gene_ids,
+        file = paste0(processed_dir, '/all_sequenced_genes_id.rds'))
 
 ###############
 # gRNA UMI data
diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3_old.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3_old.R
new file mode 100644
index 0000000..ae6be3a
--- /dev/null
+++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3_old.R
@@ -0,0 +1,242 @@
+code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
+offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
+
+# Pre-process data
+source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R"))
+packs <- c("bigstatsr", "future", "furrr", "Matrix", "rhdf5", "stringr", "openxlsx", "katsevich2020", "magrittr")
+for (pack in packs) suppressPackageStartupMessages(library(pack, character.only = TRUE))
+
+###################
+# Expression matrix
+###################
+# First, we determine the number of cells and number of genes across all the batches
+h5_files <- paste0(raw_data_dir, "/", grep(pattern = '*.h5', list.files(raw_data_dir), value = TRUE))
+dims_across_h5s <- sapply(h5_files, function(h5_file) {
+  h5_handle <- H5Fopen(h5_file)
+  dim <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/shape"
+  out <- as.integer(dim)
+  H5Fclose(h5_handle)
+  return(out)
+}) %>% t()
+row.names(dims_across_h5s) <- NULL
+all(dims_across_h5s[,1] == dims_across_h5s[1,1]) # Verify n genes consistent across files
+n_cells_total <- sum(dims_across_h5s[,2])
+n_genes_total <- dims_across_h5s[1,1]
+
+# We will use only the protein-coding genes in this analysis; load them.
+gene_df <- read.xlsx(xlsxFile = paste0(raw_data_dir, "/Genes.xlsx"), sheet = 1)
+all_protein_coding_genes <- gene_df$Gene_Symbol
+rm(gene_df)
+h5_file <- h5_files[1]
+h5_handle <- H5Fopen(h5_file)
+all_sequenced_genes <- h5_handle$"/refgenome_hg38_CROP-Guide-MS2-2.1.0/gene_names"
+all_sequenced_genes_ids <-  h5_handle$"/refgenome_hg38_CROP-Guide-MS2-2.1.0/genes"
+saveRDS(all_sequenced_genes_ids, file = paste0(processed_dir, '/all_sequenced_genes_id.rds'))
+
+h5_gene_info <- data.frame(row_idx = seq(1, n_genes_total), gene_name = all_sequenced_genes, gene_id = all_sequenced_genes_ids)
+h5_gene_info_protein_coding <- dplyr::filter(h5_gene_info,
+                                             gene_name %in% all_protein_coding_genes)
+genes_in_use <- unique(h5_gene_info_protein_coding$gene_name)
+genes_in_use_ids <- h5_gene_info_protein_coding$gene_id
+gene_idxs_in_use <- h5_gene_info_protein_coding$row_idx
+n_genes_in_use <- length(genes_in_use_ids)
+H5Fclose(h5_handle)
+
+# Next, we create a file-backed matrix to store the transpose of the expression matrix
+exp_mat_t <- FBM(nrow = n_genes_in_use, ncol = n_cells_total, type = "unsigned short", init = 0, backingfile = paste0(processed_dir, "/expression_matrix_t"), create_bk = TRUE)
+exp_mat_t_metadata <- list(nrow = n_genes_in_use, ncol = n_cells_total, type = "unsigned short", backingfile = paste0(processed_dir, "/expression_matrix_t"))
+saveRDS(object = exp_mat_t_metadata, file = paste0(processed_dir, "/exp_mat_t_metadata.rds"))
+
+exp_mat <- FBM(nrow = n_cells_total, ncol = n_genes_in_use, type = "unsigned short", init = 0, backingfile = paste0(processed_dir, "/expression_matrix"), create_bk = TRUE)
+exp_mat_metadata <- list(nrow = n_cells_total, ncol = n_genes_in_use, type = "unsigned short", backingfile = paste0(processed_dir, "/expression_matrix"))
+saveRDS(object = exp_mat_metadata, file = paste0(processed_dir, "/exp_mat_metadata.rds"))
+
+# We iterate through the hd5 files and write the column chunks piece-by-piece to the FBM.
+n_cells_processed <- 0
+ordered_cell_barcodes <- character(0)
+
+batch_lane <- stringr::str_extract(string = h5_files, pattern = "Batch-[0-9]_[0-9]") %>%
+  gsub(pattern = "Batch-", replacement = "")
+for (i in seq(1, length(h5_files))) {
+  print(i)
+  curr_batch_lane <- batch_lane[i]
+  h5_file <- h5_files[i]
+  print(paste("Working on", h5_file))
+  h5_handle <- H5Fopen(h5_file)
+  dat <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/data"
+  indices <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/indices"
+  ind_ptr <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/indptr"
+  shape <-  h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/shape"
+  n_cols <- shape[2]
+  cell_barcodes <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/barcodes"
+  # do some barcode processing
+  if (any(duplicated(cell_barcodes))) stop("Duplicate barcodes")
+  cell_barcodes <- paste0(cell_barcodes, "_", curr_batch_lane)
+  # Ensure that the maximum index pointer is less than the maximum integer value in R
+  if (max(ind_ptr) >= 2147483647) stop("Index pointer exceeds R max integer value.")
+  sparseM <- Matrix::sparseMatrix(i = indices, p = ind_ptr, x = dat, dims = shape)
+  sparseM <- sparseM[gene_idxs_in_use,]
+  # write
+  exp_mat_t[seq(1, n_genes_in_use), (n_cells_processed + 1):(n_cells_processed + n_cols)] <- as.matrix(sparseM)
+  exp_mat[(n_cells_processed + 1):(n_cells_processed + n_cols), seq(1, n_genes_in_use)] <- as.matrix(t(sparseM))
+  H5Fclose(h5_handle)
+  n_cells_processed <- n_cells_processed + n_cols
+  ordered_cell_barcodes <- c(ordered_cell_barcodes, cell_barcodes)
+}
+
+saveRDS(object = ordered_cell_barcodes, file = paste0(processed_dir, "/cell_barcodes_gene.rds"))
+saveRDS(object = genes_in_use, file = paste0(processed_dir, "/ordered_genes.RDS"))
+saveRDS(object = genes_in_use_ids, file = paste0(processed_dir, "/ordered_gene_ids.RDS"))
+
+###############
+# gRNA UMI data
+###############
+rm(list=ls())
+code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
+offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
+source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R"))
+
+# Save the bulk region names
+h5_files <- paste0(raw_data_dir, "/", grep(pattern = '*.h5', list.files(raw_data_dir), value = TRUE))
+gene_names <- rhdf5::h5read(h5_files[1], "/refgenome_hg38_CROP-Guide-MS2-2.1.0/gene_names")
+gene_ids <- rhdf5::h5read(h5_files[1], "/refgenome_hg38_CROP-Guide-MS2-2.1.0/genes")
+
+enh_targets_df <- read.xlsx(xlsxFile = paste0(raw_data_dir, "/enh_targets.xlsx"), sheet = 1)
+bulk_region_names <- filter(enh_targets_df, gene_names %in% c("ARL15", "MYB")) %>%
+  dplyr::select(region, region_name = Denoted.Region.Name.used.in.the.paper, targeted_gene = gene_names) %>%
+  filter(region_name %in% c("ARL15-enh", "MYB-enh-3")) %>%
+  dplyr::mutate(targeted_gene_id = c(gene_ids["ARL15" == gene_names],
+                                     gene_ids["MYB" == gene_names]))
+saveRDS(object = bulk_region_names, paste0(processed_dir, "/bulk_region_names.rds"))
+
+
+####################################################
+# Next, we create a data frame containing UMI counts
+####################################################
+guide_seqs <- openxlsx::read.xlsx(xlsxFile = paste0(raw_data_dir, "/all_oligos.xlsx"),
+                                  sheet = 1) %>% dplyr::select(hg38_enh_region = "region.pos.(hg38)",
+                                                               spacer_seq = "spacer.sequence")
+weird_spacers <- names(which(table(guide_seqs$spacer_seq) >= 2))
+guide_seqs <- dplyr::filter(guide_seqs, !(spacer_seq %in% weird_spacers))
+all(table(guide_seqs$spacer_seq) == 1); all(table(guide_seqs$hg38_enh_region) == 10)
+
+# get the raw file names
+raw_fs <- list.files(raw_data_dir)
+gRNA_files <- paste0(raw_data_dir, "/", grep(pattern = "sgRNA-enrichment_5K-sgRNAs_Batch", x = raw_fs, value = TRUE))
+
+# get the batch ID of each file
+batch <- stringr::str_extract(string = gRNA_files, pattern = "Batch_[0-9]_[0-9]") %>%
+  gsub(pattern = "Batch_", replacement = "")
+
+# construct a sparse matrix of gRNA counts for each file
+res <- lapply(X = seq(1L, length(gRNA_files)), FUN = function(i) {
+  curr_file <- gRNA_files[i]
+  curr_batch <- batch[i]
+  print(paste("Working on file", curr_file))
+  curr_gRNA_count_matrix <- readr::read_tsv(file = curr_file,
+                                            col_names = c("cell_barcode", "total_read_count", "total_umi_count", "gRNA_spacer_seqs", "read_counts", "umi_counts"),
+                                            col_types = c("cccccc")) %>% dplyr::select(cell_barcode, gRNA_spacer_seqs, umi_counts, total_umi_count)
+  cell_barcodes <- curr_gRNA_count_matrix$cell_barcode
+  # convert to sparse triplet matrix
+  umi_count_list <- lapply(curr_gRNA_count_matrix$umi_counts, function(v) stringr::str_split(v, pattern = ";") %>% unlist() %>% as.integer())
+  gRNA_spacer_seq_list <- lapply(curr_gRNA_count_matrix$gRNA_spacer_seqs, function(v) stringr::str_split(v, pattern = ";") %>% unlist())
+  n_entries_per_cell <- sapply(X = umi_count_list, function(v) length(v))
+  barcode_vector <- rep(x = cell_barcodes, times = n_entries_per_cell)
+  gRNA_spacer_seq_vector <- unlist(gRNA_spacer_seq_list)
+  umi_count_vector <- unlist(umi_count_list)
+  ch_df <- data.frame(barcode = barcode_vector,
+                      spacer_seq = gRNA_spacer_seq_vector,
+                      umi_count = umi_count_vector)
+  # remove all entries with spacer seqs not in the spacer seq list
+  ch_df <- ch_df %>% dplyr::filter(gRNA_spacer_seq_vector %in% guide_seqs$spacer_seq)
+  ordered_barcode_vector <- sort(unique(ch_df$barcode))
+  
+  spacer_seq_idx <- match(x = ch_df$spacer_seq, guide_seqs$spacer_seq)
+  barcode_idx <- match(x = ch_df$barcode, table = ordered_barcode_vector)
+  
+  m <- Matrix::sparseMatrix(i = spacer_seq_idx,
+                            j = barcode_idx,
+                            x = ch_df$umi_count,
+                            dims = c(length(guide_seqs$spacer_seq),
+                                     length(ordered_barcode_vector)))
+  rownames(m) <- guide_seqs$spacer_seq
+  colnames(m) <- paste0(ordered_barcode_vector, "-1_", curr_batch)
+  # obtain the gRNA UMI counts and cell count
+  curr_gRNA_count_matrix <- as.data.frame(curr_gRNA_count_matrix)
+  rownames(curr_gRNA_count_matrix) <- curr_gRNA_count_matrix$cell_barcode
+  curr_gRNA_count_matrix_sub <- curr_gRNA_count_matrix[ordered_barcode_vector,]
+  umi_counts <- as.integer(curr_gRNA_count_matrix_sub$total_umi_count)
+  n_cells <- length(umi_counts)
+  return(list(m = m, umi_counts = umi_counts, n_cells = n_cells))
+})
+
+combined_m <- do.call(what = cbind, args = lapply(res, function(i) i$m))
+all(Matrix::colSums(combined_m) >= 1); all(Matrix::rowSums(combined_m) >= 1)
+
+# Finally, perform the grouping operation
+gRNA_matrix_raw <- combined_m
+unique_regions <- unique(guide_seqs$hg38_enh_region)
+
+thresholded_ungrouped_mat <- apply(X = gRNA_matrix_raw, MARGIN = 1, FUN = function(r) {
+  r_nonzero <- r[r >= 1]
+  as.integer(r > mean(r_nonzero))
+}) %>% Matrix::t()
+colnames(thresholded_ungrouped_mat) <- colnames(gRNA_matrix_raw)
+
+thresholded_grouped_mat <- sapply(X = unique_regions, FUN = function(curr_region) {
+  print(curr_region)
+  curr_spacer_seqs <- dplyr::filter(guide_seqs, hg38_enh_region == curr_region) %>%
+    dplyr::pull(spacer_seq)
+  curr_m <- thresholded_ungrouped_mat[curr_spacer_seqs,]
+  apply(X = curr_m, MARGIN = 2, FUN = function(col) max(col))
+}) %>% t()
+
+saveRDS(thresholded_grouped_mat, paste0(processed_dir, "/gRNA_indicator_matrix_unordered.rds"))
+
+
+# compute the cell covariate matrix
+cell_gRNA_umi_counts <- lapply(res, function(i) i$umi_counts) %>% do.call(what = "c", args = .)
+cell_counts <- lapply(res, function(i) i$n_cells) %>% unlist()
+cell_covariate_matrix <- data.frame(cell_barcode = colnames(thresholded_grouped_mat),
+                                    cell_gRNA_umi_counts = cell_gRNA_umi_counts,
+                                    batch = rep(batch, times = cell_counts))
+saveRDS(object = cell_covariate_matrix, file = paste0(processed_dir, "/cell_covariate_matrix_gRNA.rds"))
+
+##############
+# Bulk RNA-seq
+##############
+library(openxlsx)
+rm(list=ls())
+code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
+offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
+source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R"))
+
+gene_df <- read.xlsx(xlsxFile = paste0(raw_data_dir, "/Genes.xlsx"), sheet = 1)
+all_protein_coding_genes <- gene_df$Gene_Symbol
+bulk_info <- read.xlsx(xlsxFile = paste0(raw_data_dir, "/bulk_rna_info.xlsx"), sheet = 3) %>% select(library_name = Library.Name, gRNA = sgRNA, region = Region, biological_duplicate = Biological.Duplicate)
+bulk_info_arl15_enh <- slice(bulk_info, 1:25)
+bulk_info_myb_enh3 <- slice(bulk_info, 26:49)
+
+bulk_df <- suppressWarnings(read_tsv(file = paste0(raw_data_dir, "/GSE129825_Libraries.FeatureCounts.ARL15_enhancer.txt"), col_types = "ccccciiiiiiiiiiiiiiiiiiiiiiiiii")) %>% rename("PZ788" = "PZ778...10", "PZ778" = "PZ778...13")
+bulk_df_arl15_enh <- filter(bulk_df, Geneid %in% all_protein_coding_genes)
+bulk_df <- suppressWarnings(read_tsv(file = paste0(raw_data_dir, "/GSE129825_Libraries.FeatureCounts.MYB_enhancer.txt"), col_types = "ccccciiiiiiiiiiiiiiiiiiiiiiiiii"))
+bulk_df_myb_enh3 <- filter(bulk_df, Geneid %in% all_protein_coding_genes)
+
+bulk_rnaseq <- list(data = list(arl15_enh = bulk_df_arl15_enh, myb_enh3 = bulk_df_myb_enh3), info = list(arl15_enh = bulk_info_arl15_enh, myb_enh3 = bulk_info_myb_enh3))
+saveRDS(object = bulk_rnaseq, file = paste0(processed_dir, "/bulk_RNAseq.rds"))
+
+#################################################
+# Xie hypergeometric p-values for ARL15 and MYB-3
+#################################################
+all_sequenced_genes_ids <- readRDS(file = paste0(processed_dir, '/all_sequenced_genes_id.rds'))
+suppressPackageStartupMessages(library(R.matlab))
+extract_p_vals_hypergeo <- function(p_vals_hypergeo) {
+  p_vals <- exp(p_vals_hypergeo$matrix[1,])
+  names(p_vals) <- all_sequenced_genes_ids
+  return(p_vals)
+}
+
+xie_pfiles <- setNames(paste0(raw_data_dir, c("/hypergeometric_pvals_arl15_down.mat", "/hypergeometric_pvals_myb3_down.mat")), c("arl15_enh", "myb_enh3"))
+xie_pfiles_r <- xie_pfiles %>% map(readMat) %>% map(extract_p_vals_hypergeo)
+
+saveRDS(object = xie_pfiles_r, file = paste0(processed_dir, c("/xie_p_values.rds")))
diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R
index 0f25052..9a082bb 100644
--- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R
+++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R
@@ -1,54 +1,53 @@
 code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
 offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
 source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R"))
+library(ondisc)
 
-#################################################
-# 1. Load gene data; create gene covariate matrix
-#################################################
-barcodes_gene <- readRDS(paste0(processed_dir, "/cell_barcodes_gene.rds"))
-
-# Compute the number of UMIs in each cell
-exp_mat_metadata_t <- readRDS(paste0(processed_dir, "/exp_mat_t_metadata.rds"))
-exp_mat_metadata_t$backingfile <- paste0(processed_dir, "/expression_matrix_t")
-exp_mat_t <- exp_mat_metadata_t %>% sceptre::load_fbm()
-
-n_umis_per_cell <- big_apply(exp_mat_t, function(X, ind) {colSums(X[,ind])}) %>% unlist()
-gene_covariate_matrix <- data.frame(cell_barcode = barcodes_gene, log_n_umis = n_umis_per_cell)
-gene_barcode_original_order <- gene_covariate_matrix$cell_barcode
+##############
+# 1. Gene data
+##############
+odm_dir <- paste0(processed_dir, "/odm")
+odm_fp <- paste0(odm_dir, "/gene_expression_matrix.odm")
+metadata_fp <- paste0(odm_dir, "/metadata_plus.rds")
+gene_odm <- read_odm(odm_fp = odm_fp, metadata_fp = metadata_fp)
 
 # Compute the mean expression of each gene
-gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS"))
-exp_mat_metadata <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds"))
-exp_mat_metadata$backingfile <- paste0(processed_dir, "/expression_matrix")
-exp_mat <- load_fbm(exp_mat_metadata)
-gene_expression_p <- big_apply(exp_mat, function(X, ind) colMeans(X[,ind] >= 1)) %>% unlist()
-highly_expressed_genes <- gene_ids[which(gene_expression_p >= 0.005)] # threshold of 0.005 (1/2%)
+highly_expressed_genes <- get_feature_covariates(gene_odm) %>% dplyr::mutate(gene_id = get_feature_ids(gene_odm),
+                                                   p_expressed = n_nonzero / ncol(gene_odm)) %>%
+  dplyr::filter(p_expressed >= 0.005, protein_coding) %>% dplyr::pull(gene_id)
 
-##########################################
-# Load gRNA data and gRNA covariate matrix
-##########################################
+# Save the highly expressed genes; subset expression matrix according to these genes
+saveRDS(object = highly_expressed_genes, file = paste0(processed_dir, "/highly_expressed_genes.rds"))
+gene_odm <- gene_odm[highly_expressed_genes,] # 10,014 genes, 106,670 cells
+
+##############
+# 2. gRNA data
+##############
 gRNA_indic_matrix <- readRDS(paste0(processed_dir, "/gRNA_indicator_matrix_unordered.rds"))
 gRNA_covariate_matrix <- readRDS(paste0(processed_dir, "/cell_covariate_matrix_gRNA.rds"))
 rownames(gRNA_covariate_matrix) <- gRNA_covariate_matrix$cell_barcode
 
-# Intersect the cells
-cells_intersect <- intersect(x = gene_barcode_original_order, gRNA_covariate_matrix$cell_barcode)
+######################
+# 3. Cell intersection
+######################
+# Intersect the gRNA and gene cells
+cells_intersect <- intersect(x = get_cell_barcodes(gene_odm),
+                             y = colnames(gRNA_indic_matrix))
 
 # subset the gRNA indicator matrix and covariate matrix appropriately
 gRNA_indic_matrix_sub <- as.data.frame(gRNA_indic_matrix[,cells_intersect] %>% t())
 gRNA_covariate_matrix_sub <- gRNA_covariate_matrix[cells_intersect,]
 
-# get the cell subset ordering
-cell_subset <- match(cells_intersect, gene_barcode_original_order)
-
-################################
-# get the global covariate matrix
-################################
-global_covariate_matrix <- data.frame(log_n_umis = log(n_umis_per_cell[cell_subset]),
+# subset the gene matrix
+gene_odm <- gene_odm[,cells_intersect]
+# obtain the global cell covariate matrix
+global_covariate_matrix <- data.frame(log_n_umis = get_cell_covariates(gene_odm) %>% dplyr::pull(n_umis) %>% log(),
                                       log_n_gRNA_umis = log(gRNA_covariate_matrix_sub$cell_gRNA_umi_counts),
                                       batch = gRNA_covariate_matrix_sub$batch)
 
-# save
+##################################################################################
+# Create analysis-ready dir; save highly expressed genes and gRNA indicator matrix
+##################################################################################
 # create a new directory, "analysis_ready" to store all data in analysis-ready format.
 analysis_ready_dir <- paste0(offsite_dir, "data/xie/analysis_ready")
 if (!dir.exists(analysis_ready_dir)) dir.create(path = analysis_ready_dir, recursive = TRUE)
@@ -56,14 +55,12 @@ if (!dir.exists(analysis_ready_dir)) dir.create(path = analysis_ready_dir, recur
 fst::write_fst(x = global_covariate_matrix,
                path = paste0(analysis_ready_dir, "/covariate_model_matrix.fst"))
 saveRDS(object = highly_expressed_genes, file = paste0(processed_dir, "/highly_expressed_genes.rds"))
-gRNA_indic_mat <- fst::write_fst(gRNA_indic_matrix_sub, paste0(analysis_ready_dir, "/gRNA_indicator_matrix.fst"))
+fst::write_fst(gRNA_indic_matrix_sub, paste0(analysis_ready_dir, "/gRNA_indicator_matrix.fst"))
 
-###############################################
-# save subsetted cell-by-gene expression matrix
-###############################################
-gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS"))
-exp_mat_mem <- exp_mat[,seq(1, ncol(exp_mat))]
-exp_mat_sub <- exp_mat_mem[cell_subset,]
+######################################################################################
+# save subsetted cell-by-gene expression matrix (only highly expressed genes included)
+######################################################################################
+exp_mat_sub <- as.matrix(gene_odm[[,1:ncol(gene_odm)]])
 exp_mat_sub_disk <- FBM(nrow = nrow(exp_mat_sub),
                         ncol = ncol(exp_mat_sub),
                         type = "unsigned short",
diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4_old.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4_old.R
new file mode 100644
index 0000000..4e041c4
--- /dev/null
+++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4_old.R
@@ -0,0 +1,78 @@
+code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
+offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
+source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R"))
+
+#################################################
+# 1. Load gene data; create gene covariate matrix
+#################################################
+barcodes_gene <- readRDS(paste0(processed_dir, "/cell_barcodes_gene.rds"))
+
+# Compute the number of UMIs in each cell
+exp_mat_metadata_t <- readRDS(paste0(processed_dir, "/exp_mat_t_metadata.rds"))
+exp_mat_metadata_t$backingfile <- paste0(processed_dir, "/expression_matrix_t")
+exp_mat_t <- exp_mat_metadata_t %>% sceptre::load_fbm()
+
+n_umis_per_cell <- big_apply(exp_mat_t, function(X, ind) {colSums(X[,ind])}) %>% unlist()
+gene_covariate_matrix <- data.frame(cell_barcode = barcodes_gene, log_n_umis = n_umis_per_cell)
+gene_barcode_original_order <- gene_covariate_matrix$cell_barcode
+
+# Compute the mean expression of each gene
+gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS"))
+exp_mat_metadata <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds"))
+exp_mat_metadata$backingfile <- paste0(processed_dir, "/expression_matrix")
+exp_mat <- load_fbm(exp_mat_metadata)
+gene_expression_p <- big_apply(exp_mat, function(X, ind) colMeans(X[,ind] >= 1)) %>% unlist()
+highly_expressed_genes <- gene_ids[which(gene_expression_p >= 0.005)] # threshold of 0.005 (1/2%)
+
+##########################################
+# Load gRNA data and gRNA covariate matrix
+##########################################
+gRNA_indic_matrix <- readRDS(paste0(processed_dir, "/gRNA_indicator_matrix_unordered.rds"))
+gRNA_covariate_matrix <- readRDS(paste0(processed_dir, "/cell_covariate_matrix_gRNA.rds"))
+rownames(gRNA_covariate_matrix) <- gRNA_covariate_matrix$cell_barcode
+
+# Intersect the cells
+cells_intersect <- intersect(x = gene_barcode_original_order, gRNA_covariate_matrix$cell_barcode)
+
+# subset the gRNA indicator matrix and covariate matrix appropriately
+gRNA_indic_matrix_sub <- as.data.frame(gRNA_indic_matrix[,cells_intersect] %>% t())
+gRNA_covariate_matrix_sub <- gRNA_covariate_matrix[cells_intersect,]
+
+# get the cell subset ordering
+cell_subset <- match(cells_intersect, gene_barcode_original_order)
+
+################################
+# get the global covariate matrix
+################################
+global_covariate_matrix <- data.frame(log_n_umis = log(n_umis_per_cell[cell_subset]),
+                                      log_n_gRNA_umis = log(gRNA_covariate_matrix_sub$cell_gRNA_umi_counts),
+                                      batch = gRNA_covariate_matrix_sub$batch)
+
+# save
+# create a new directory, "analysis_ready" to store all data in analysis-ready format.
+analysis_ready_dir <- paste0(offsite_dir, "data/xie/analysis_ready")
+if (!dir.exists(analysis_ready_dir)) dir.create(path = analysis_ready_dir, recursive = TRUE)
+
+fst::write_fst(x = global_covariate_matrix,
+               path = paste0(analysis_ready_dir, "/covariate_model_matrix.fst"))
+saveRDS(object = highly_expressed_genes, file = paste0(processed_dir, "/highly_expressed_genes.rds"))
+gRNA_indic_mat <- fst::write_fst(gRNA_indic_matrix_sub, paste0(analysis_ready_dir, "/gRNA_indicator_matrix.fst"))
+
+######################################################################################
+# save subsetted cell-by-gene expression matrix (only highly expressed genes included)
+######################################################################################
+gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS"))
+exp_mat_mem <- exp_mat[,seq(1, ncol(exp_mat))]
+exp_mat_sub <- exp_mat_mem[cell_subset,]
+exp_mat_sub_disk <- FBM(nrow = nrow(exp_mat_sub),
+                        ncol = ncol(exp_mat_sub),
+                        type = "unsigned short",
+                        init = 0,
+                        backingfile = paste0(analysis_ready_dir, "/expression_matrix_sub"),
+                        create_bk = TRUE)
+exp_mat_sub_disk[1:nrow(exp_mat_sub), 1:ncol(exp_mat_sub)] <- exp_mat_sub
+exp_mat_sub_metadata <- list(nrow = nrow(exp_mat_sub),
+                             ncol = ncol(exp_mat_sub),
+                             type = "unsigned short",
+                             backingfile = paste0(analysis_ready_dir, "/expression_matrix_sub"))
+saveRDS(object = exp_mat_sub_metadata, paste0(analysis_ready_dir, "/exp_mat_sub_metadata.rds"))
diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R
index 0eff249..3e2c4da 100644
--- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R
+++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R
@@ -13,7 +13,7 @@ library(biomaRt)
 #####################
 # 1. gene positions
 #####################
-gene.id <- readRDS(paste0(processed_dir, '/highly_expressed_genes.rds'))   # 5129
+gene.id <- readRDS(paste0(processed_dir, '/highly_expressed_genes.rds'))   # 10614
 gene.ensembl.id <- lapply(strsplit(gene.id, '[.]'), function(x){x[1]}) %>% unlist()
 ensembl <- biomaRt::useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
 ensembl.37 <- biomaRt::useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", GRCh = 37)
@@ -68,21 +68,22 @@ chr.select = intersect(gRNA.mart$chr, gene.mart$chr)
 
 select.gRNA.gene.pair = NULL
 
-for(chr in chr.select){
+for (chr in chr.select) {
   gene.chr.id = which(gene.mart$chr == chr)
   gRNA.chr.id = which(gRNA.mart$chr == chr)
   gene.pos.start = gene.mart$start_position[gene.chr.id]
   gene.pos.end = gene.mart$end_position[gene.chr.id]
   gene.tss = gene.pos.start
-  gene.tss[gene.mart$strand == -1] = gene.pos.end[gene.mart$strand == -1]
+  # gene.tss[gene.mart$strand == -1] = gene.pos.end[gene.mart$strand == -1]
   gRNA.pos = gRNA.mart$mid_position[gRNA.chr.id]
-  distance.temp = sapply(gene.tss, function(x){x - gRNA.pos})
-  temp.id = which(abs(distance.temp) < 1000000, arr.ind = T)
+  distance.temp = sapply(gene.tss, function(x) {abs(x - gRNA.pos)})
+  temp.id = which(distance.temp < 1000000, arr.ind = T)
   select.gRNA.gene.pair = rbind(select.gRNA.gene.pair,
                                 data.frame(gene.id = gene.mart$original.id[gene.chr.id[temp.id[, 2]]],
                                            gRNA.id = gRNA.id[gRNA.chr.id[temp.id[, 1]]]))
   # cat(chr, ' is done! \n')
 }
+
 dim(select.gRNA.gene.pair)
 # 2376 pairs of gene and gRNA
 saveRDS(select.gRNA.gene.pair, file = paste0(processed_dir, "/select_gRNA_gene_pair.rds"))