Skip to content

Commit

Permalink
Merge pull request #652 from green-striped-gecko/dev_luis
Browse files Browse the repository at this point in the history
Dev luis
  • Loading branch information
mijangos81 authored Dec 12, 2023
2 parents 2e8ad29 + 6a0dc4b commit 39e6bcc
Show file tree
Hide file tree
Showing 8 changed files with 425 additions and 35 deletions.
190 changes: 181 additions & 9 deletions R/gl.read.vcf.r
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
#' Converts a vcf file into a genlight object
#'
#' This function needs package vcfR, please install it. The converted genlight
#' object does not have individual metrics. You need to add them 'manually' to
#' the other$ind.metrics slot.
#' This function needs package vcfR, please install it.
#' @param vcffile A vcf file (works only for diploid data) [required].
#' @param ind.metafile Optional file in csv format with metadata for each
#' individual (see details for explanation) [default NULL].
#' @param verbose Verbosity: 0, silent or fatal errors; 1, begin and end; 2,
#' progress log; 3, progress and results summary; 5, full report
#' [default 2, unless specified using gl.set.verbosity].
#' @details
#' The ind.metadata file needs to have very specific headings. First a heading
#' called id. Here the ids have to match the ids in the dartR object.
#' The following column headings are optional.
#' pop: specifies the population membership of each individual. lat and lon
#' specify spatial coordinates (in decimal degrees WGS1984 format). Additional
#' columns with individual metadata can be imported (e.g. age, gender).
#' @return A genlight object.
#' @export
#' @author Bernd Gruber (Post to \url{https://groups.google.com/d/forum/dartr})
Expand All @@ -16,6 +23,7 @@
#' }

gl.read.vcf <- function(vcffile,
ind.metafile = NULL,
verbose = NULL) {
# SET VERBOSITY
verbose <- gl.check.verbosity(verbose)
Expand All @@ -36,14 +44,16 @@ gl.read.vcf <- function(vcffile,
" needed for this function to work. Please install it.\n"
))
return(-1)
} else {
}

vcf <- vcfR::read.vcfR(file = vcffile, verbose = verbose)
myRef <- vcfR::getREF(vcf)
myAlt <- vcfR::getALT(vcf)
chrom <- vcfR::getCHROM(vcf)
pos <- vcfR::getPOS(vcf)
loc.all <- paste0(myRef,"/",myAlt)
x <- vcfR::vcfR2genlight(vcf)
x@loc.all <- loc.all


# adding SNP information from VCF
info_tmp_1 <- vcf@fix[,6:7]
info_tmp_2 <- vcfR::getINFO(vcf)
Expand All @@ -54,7 +64,15 @@ gl.read.vcf <- function(vcffile,
info_tmp_2 <- as.data.frame(do.call(rbind,stringr::str_split(info_tmp_2,pattern = "=|;")))
info <- info_tmp_2[,seq(2,ncol(info_tmp_2),2)]
info <- cbind(info_tmp_1,info)
colnames(info) <- c("QUAL","FILTER",unname(unlist(info_tmp_2[1,seq(1,ncol(info_tmp_2),2)])))
col.names.info <- c("QUAL","FILTER",unname(unlist(info_tmp_2[1,seq(1,ncol(info_tmp_2),2)])))
if(length(col.names.info)!= length(colnames(info))){
message(warn(
" Locus information is not formatted correctly. One reason could be that a field could have missing values."))
info <- info_tmp_1
colnames(info) <- c("QUAL","FILTER")
}else{
colnames(info) <- col.names.info
}
}

# identify which SNPs have more than 2 alleles
Expand All @@ -63,19 +81,174 @@ gl.read.vcf <- function(vcffile,
if(length(more_alleles)!=0){
info <- info[-more_alleles,]
info[] <- lapply(info, as.numeric)
x@loc.all <- loc.all[-more_alleles]
x@chromosome <- as.factor(chrom[-more_alleles])
x@position <- pos[-more_alleles]
}else{
x@loc.all <- loc.all
x@chromosome <- as.factor(chrom)
x@position <- pos
}
}else{
ALT <- vcfR::getALT(vcf)
more_alleles <- which(stringr::str_length(ALT) >1)
more_alleles <- grep(pattern = ",",ALT)
if(length(more_alleles)>0){
info <- info[-more_alleles,]
x@loc.all <- loc.all[-more_alleles]
x@chromosome <- as.factor(chrom[-more_alleles])
x@position <- pos[-more_alleles]
}else{
x@loc.all <- loc.all
x@chromosome <- as.factor(chrom)
x@position <- pos
}
}

ploidy(x) <- 2
x <- gl.compliance.check(x)

x$other$loc.metrics <- cbind(x$other$loc.metrics,info)
x$other$loc.metrics$QUAL <- as.numeric(x$other$loc.metrics$QUAL)

# additional metadata and long lat to the data file are stored in other

if (!is.null(ind.metafile)) {
if (verbose >= 2) {
cat(report(
paste("Adding individual metrics:", ind.metafile, ".\n")
))
}
###### population and individual file to link numbers to populations...
ind.cov <- read.csv(ind.metafile,
header = TRUE,
stringsAsFactors = TRUE)
# is there an entry for every individual

id.col <- match("id", names(ind.cov))

if (is.na(id.col)) {
stop(error("Fatal Error: There is no id column\n"))
} else {
ind.cov[, id.col] <-
trimws(ind.cov[, id.col], which = "both") #trim spaces

if (length(ind.cov[, id.col]) != length(unique(ind.cov[, id.col]))) {
cat(error(
"Individual names are not unique. You need to change them!\n"
))
stop()
}

# reorder
if (length(ind.cov[, id.col]) != length(indNames(x))) {
cat(
warn(
"Ids for individual metadata does not match the number of ids in the SNP data file. Maybe this is fine if a subset matches.\n"
)
)
nam.indmeta <- ind.cov[, id.col]
nam.dart <- indNames(x)

nm.indmeta <- nam.indmeta[!nam.indmeta %in% nam.dart]
nm.inddart <- nam.dart[!nam.dart %in% nam.indmeta]
if (length(nm.indmeta) > 0) {
cat(warn("ind.metafile ids not matched were:\n"))
print(nm.indmeta)
}
if (length(nm.inddart) > 0) {
cat(warn("DArT file ids not matched were:\n"))
print(nm.inddart)
}
}

ord <- match(indNames(x), ind.cov[, id.col])
ord <- ord[!is.na(ord)]

if (length(ord) > 1 & length(ord) <= nInd(x)) {
if (verbose >= 2) {
cat(report(
paste(
" Ids for individual metadata (at least a subset of) are matching!\n"
)
))
cat(report(
paste(
" Found ",
length(ord == nInd(x)),
"matching ids out of",
nrow(ind.cov),
"ids provided in the ind.metadata file.\n "
)
))
}
ord2 <- match(ind.cov[ord, id.col], indNames(x))
x <- x[ord2, ]
} else {
stop(error(
"Fatal Error: Individual ids are not matching!!!!\n"
))
}
}

pop.col <- match("pop", names(ind.cov))

if (is.na(pop.col)) {
if (verbose >= 1) {
cat(
warn(
" Warning: There is no pop column, created one with all pop1 as default for all individuals\n"
)
)
}
pop(x) <- factor(rep("pop1", nInd(x)))
} else {
pop(x) <- as.factor(ind.cov[ord, pop.col])
if (verbose >= 2) {
cat(report(" Added population assignments.\n"))
}
}

lat.col <- match("lat", names(ind.cov))
lon.col <- match("lon", names(ind.cov))
if (verbose >= 2) {
if (is.na(lat.col)) {
cat(
warn(
"Warning: Individual metrics do not include a latitude (lat) column\n"
)
)
}
if (is.na(lon.col)) {
cat(
warn(
"Warning: Individual metrics do not include a longitude (lon) column\n"
)
)
}
}
if (!is.na(lat.col) & !is.na(lon.col)) {
x@other$latlon <- ind.cov[ord, c(lat.col, lon.col)]
rownames(x@other$latlon) <- ind.cov[ord, id.col]
if (verbose >= 2) {
cat(report(" Added latlon data.\n"))
}
}

other.col <- names(ind.cov)
if (length(other.col) > 0) {
x@other$ind.metrics <- ind.cov[ord, other.col, drop = FALSE]
rownames(x@other$ind.metrics) <- ind.cov[ord, id.col]
if (verbose >= 2) {
cat(report(
paste(
" Added ",
other.col,
" to the other$ind.metrics slot.\n"
)
))
}
}
}

# add history
x@other$history <- list(match.call())
Expand All @@ -89,6 +262,5 @@ gl.read.vcf <- function(vcffile,
)
}
return(x)
}

}
5 changes: 3 additions & 2 deletions R/gl.report.replicates.r
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ gl.report.replicates <- function(x,
ind_list <- NULL

for (y in 1:nrow(unique_ind_tmp)) {
if (unique_ind_tmp[y, "test_stat"] == TRUE) {
if (unique_ind_tmp[y, "test_stat"] == FALSE) {
ind_tmp <- unique_ind_tmp[y, "ind1"]
} else{
ind_tmp <- unique_ind_tmp[y, "ind2"]
Expand All @@ -168,9 +168,10 @@ gl.report.replicates <- function(x,
# getting list of replicated individuals

ind_mat <- as.matrix(reshape2::acast(unique_ind, ind1~ind2, value.var="perc"))
# ind_mat[lower.tri(ind_mat,diag = TRUE)] <- NA

ind_list_rep <- apply(ind_mat,2,function(x){
x[which(!is.na(x))]
names(x[which(!is.na(x))])
})

# Histograms
Expand Down
4 changes: 3 additions & 1 deletion R/gl.run.EMIBD9.R
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ gl.run.EMIBD9 <- function(x,
system("EM_IBD_P.exe INP:MyData.par")
}


x_lines <- readLines("EMIBD9_Res.ibd9")
strt <- which(grepl( "^IBD",x_lines)) + 2
stp <- which(grepl("Indiv genotypes",x_lines)) - 4
Expand All @@ -135,6 +134,9 @@ gl.run.EMIBD9 <- function(x,
colnames(res) <- restore_names_2$id
rownames(res) <- restore_names_2$id

order_mat <- colnames(res)[order(colnames(res))]
res <- res[order_mat, order_mat]

return(res)

}
Expand Down
Loading

0 comments on commit 39e6bcc

Please sign in to comment.