correlation_batch

tf2 · Mar 1, 2023 · 88a9fdc · 88a9fdc
1 parent 0561936
commit 88a9fdc
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 3 deletions.
diff --git a/src/Rbin/DESCRIPTION b/src/Rbin/DESCRIPTION
@@ -9,4 +9,4 @@ Depends: ViteRbi, mixtools
 Description: Super Fast File Indexing and Access 
 License: Artistic
 NeedsCompilation: yes
-Packaged: 2020-10-22 21:24:51 UTC; tomas
+Packaged: 2023-02-28 19:00:41 UTC; tomas
diff --git a/src/Rbin/R/Rbin.R b/src/Rbin/R/Rbin.R
@@ -341,13 +341,42 @@ generate_correlation_fast <- function(bin_dir, cor_dir, index_file) {
 		   		"PACKAGE" = "Rbin"
 				)$values
 	}
+	rr_mat = rr_mat[index$V1<23,]
 	colnames(rr_mat) = basename(filenames)
 	cor_mat = cor(rr_mat)
 	# TODO: output to each file
-	dataset = data.frame(filenames, co)
-	write.table(dataset, file=cor_file, sep="\t", row.names=F, col.names=F, quote=F)
+	for(x in 1:ncol(cor_mat)) {
+		dataset = data.frame(rownames(cor_mat, cor_mat[,x]))
+		write.table(dataset, file=cor_file, sep="\t", row.names=F, col.names=F, quote=F)
+	}
+}
+
+## Compute pairwise correlations for one batch of files found in bin_dir.
+##
+## For every file in the batch, nrow(index) values are fetched through the
+## native "getValues" routine of the Rbin package. Rows where index$V1 < 23
+## are kept, the files are correlated against each other with cor(), and one
+## tab-separated file per input (named after the input's basename) is written
+## into cor_dir, holding the correlation of that input with every file of the
+## batch.
+##
+## Arguments:
+##   bin_dir    directory whose entries (as listed by dir()) are the inputs
+##   cor_dir    directory receiving the output files
+##   batch_size offset of the last file relative to start_pos; the range is
+##              inclusive, so up to batch_size + 1 files are processed
+##              (NOTE(review): confirm the extra file is intended)
+##   start_pos  1-based position of the first file of the batch
+##   index_file table readable by read.table(); its row count sets how many
+##              values are requested per file and its V1 column drives the
+##              row filter
+##
+## Returns NULL invisibly; called for its side effect of writing files.
+generate_correlation_batch <- function(bin_dir, cor_dir, batch_size, start_pos, index_file) {
+	index <- read.table(index_file)
+	n_rows <- nrow(index)
+	all_files <- dir(bin_dir, full.names = TRUE)
+	# Clamp the inclusive batch range to the files that actually exist:
+	# indexing past the end would yield NA filenames that get handed to C.
+	last_pos <- min(start_pos + batch_size, length(all_files))
+	if (start_pos > last_pos) {
+		warning("no files in batch starting at position ", start_pos, call. = FALSE)
+		return(invisible(NULL))
+	}
+	filenames <- all_files[start_pos:last_pos]
+	rr_mat <- matrix(0, nrow = n_rows, ncol = length(filenames))
+	for (x in seq_along(filenames)) {
+		rr_mat[, x] <- .C("getValues",
+				"filename" = filenames[x],
+				"start_position" = as.double(0),
+				"number_row_to_read" = as.integer(n_rows),
+				"values" = as.double(seq_len(n_rows)),
+				"PACKAGE" = "Rbin"
+				)$values
+	}
+	# drop = FALSE keeps a matrix even when a single row or a single file
+	# remains, so the colnames<- below cannot fail on a collapsed vector.
+	rr_mat <- rr_mat[index$V1 < 23, , drop = FALSE]
+	colnames(rr_mat) <- basename(filenames)
+	cor_mat <- cor(rr_mat)
+	for (x in seq_len(ncol(cor_mat))) {
+		dataset <- data.frame(rownames(cor_mat), cor_mat[, x])
+		cor_file <- file.path(cor_dir, colnames(cor_mat)[x])
+		write.table(dataset, file = cor_file, sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
+	}
+	invisible(NULL)
 }
 
+
 ## This function returns sample_names based on type (mixed, matched and mismatched)
 ## Will be used in get_references() and run_hmm_rbin()
 `get_ref_sample_names_by_type` <- function(sample_name, cor_dir, gender_file, batch_size = 5000, cor_cut = 0.9, skip_em=FALSE, type="mixed") {