Skip to content

Commit

Permalink
correlation_batch
Browse files Browse the repository at this point in the history
  • Loading branch information
Tomas Fitzgerald committed Mar 1, 2023
1 parent 0561936 commit 88a9fdc
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/Rbin/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ Depends: ViteRbi, mixtools
Description: Super Fast File Indexing and Access
License: Artistic
NeedsCompilation: yes
Packaged: 2020-10-22 21:24:51 UTC; tomas
Packaged: 2023-02-28 19:00:41 UTC; tomas
33 changes: 31 additions & 2 deletions src/Rbin/R/Rbin.R
Original file line number Diff line number Diff line change
Expand Up @@ -341,13 +341,42 @@ generate_correlation_fast <- function(bin_dir, cor_dir, index_file) {
"PACKAGE" = "Rbin"
)$values
}
rr_mat = rr_mat[index$V1<23,]
colnames(rr_mat) = basename(filenames)
cor_mat = cor(rr_mat)
# TODO: output to each file
dataset = data.frame(filenames, co)
write.table(dataset, file=cor_file, sep="\t", row.names=F, col.names=F, quote=F)
for(x in 1:ncol(cor_mat)) {
dataset = data.frame(rownames(cor_mat, cor_mat[,x]))
write.table(dataset, file=cor_file, sep="\t", row.names=F, col.names=F, quote=F)
}
}

#' Compute and write pairwise correlations for one batch of bin files
#'
#' Lists the files in `bin_dir`, selects one batch of them, reads
#' `nrow(index)` values from each file through the package's native
#' `getValues` routine, keeps the rows whose first index column is below 23,
#' and correlates the files against each other with `cor()`. For every file
#' in the batch, a headerless tab-separated file named after it is written
#' to `cor_dir`, holding two columns: the file names in the batch and their
#' correlation with that file.
#'
#' @param bin_dir Directory holding the binary input files.
#' @param cor_dir Directory that receives one output file per input file.
#' @param batch_size Offset of the last file in the batch relative to
#'   `start_pos`. The range is inclusive, so up to `batch_size + 1` files
#'   are processed.
#' @param start_pos Position (in the `dir()` listing) of the first file.
#' @param index_file Path to a table readable by `read.table()`; its row
#'   count sets how many values are read per file and its first column
#'   (`V1`) drives the row filter.
#' @return `NULL`, invisibly; called for its side effect of writing files.
generate_correlation_batch <- function(bin_dir, cor_dir, batch_size,
                                       start_pos, index_file) {
  # Read everything in batches - ideally a batch would cover every file, but
  # that requires a lot of memory.
  index <- read.table(index_file)
  n_rows <- nrow(index)

  filenames <- dir(bin_dir, full.names = TRUE)

  # NOTE(review): the range is inclusive on both ends, so a batch holds
  # batch_size + 1 files. Kept as-is because callers may step by it; confirm
  # whether batch_size was meant to be the exact file count.
  positions <- seq(start_pos, start_pos + batch_size)
  # Indexing past the end of the listing yields NA filenames, which would be
  # passed on to the native reader; keep only positions that really exist.
  positions <- positions[positions >= 1 & positions <= length(filenames)]
  if (length(positions) == 0) {
    stop(
      "No files in '", bin_dir, "' for start_pos = ", start_pos,
      " and batch_size = ", batch_size,
      call. = FALSE
    )
  }
  filenames <- filenames[positions]

  # One column per file, one row per index entry.
  rr_mat <- matrix(0, nrow = n_rows, ncol = length(filenames))
  for (x in seq_along(filenames)) {
    rr_mat[, x] <- .C(
      "getValues",
      "filename" = filenames[x],
      "start_position" = as.double(0),
      "number_row_to_read" = as.integer(n_rows),
      "values" = as.double(seq_len(n_rows)),
      "PACKAGE" = "Rbin"
    )$values
  }

  # drop = FALSE keeps a matrix even when the batch contains a single file;
  # otherwise the result is a bare vector and setting colnames fails.
  rr_mat <- rr_mat[index$V1 < 23, , drop = FALSE]
  colnames(rr_mat) <- basename(filenames)
  cor_mat <- cor(rr_mat)

  # Write one two-column file (name, correlation) per file in the batch.
  for (x in seq_len(ncol(cor_mat))) {
    dataset <- data.frame(rownames(cor_mat), cor_mat[, x])
    cor_file <- file.path(cor_dir, colnames(cor_mat)[x])
    write.table(
      dataset,
      file = cor_file, sep = "\t",
      row.names = FALSE, col.names = FALSE, quote = FALSE
    )
  }
  invisible(NULL)
}


## This function returns sample_names based on type (mixed, matched and mismatched)
## Will be used in get_references() and run_hmm_rbin()
`get_ref_sample_names_by_type` <- function(sample_name, cor_dir, gender_file, batch_size = 5000, cor_cut = 0.9, skip_em=FALSE, type="mixed") {
Expand Down

0 comments on commit 88a9fdc

Please sign in to comment.