From 50ec8618cc00ba0f5cc65caf40f3d30049bf417a Mon Sep 17 00:00:00 2001 From: Gerry Tonkin-Hill Date: Mon, 23 Aug 2021 06:32:46 +0200 Subject: [PATCH] Added load_fasta from rhierbaps --- R/load_fasta.R | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 R/load_fasta.R diff --git a/R/load_fasta.R b/R/load_fasta.R new file mode 100644 index 0000000..53e5125 --- /dev/null +++ b/R/load_fasta.R @@ -0,0 +1,56 @@ +#' load_fasta +#' +#' Loads a fasta file into matrix format ready for +#' running the hierBAPS algorithm. +#' +#' @param msa Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered +#' @param keep.singletons A logical indicating whether to consider singleton mutations in calculating the clusters +#' +#' @return A character matrix with filtered SNP data +#' +#' @examples +#' msa <- system.file("extdata", "seqs.fa", package = "rhierbaps") +#' snp.matrix <- load_fasta(msa) +#' @export +load_fasta <- function(msa, keep.singletons=FALSE) { + + #Check inputs + if(class(msa)=="character"){ + if (!file.exists(msa)) stop("Invalid msa or the file does not exist!") + seqs <- ape::read.FASTA(msa) + } else if(class(msa)=="matrix"){ + seqs <- ape::as.DNAbin(msa) + } else if(class(msa)=="DNAbin"){ + seqs <- msa + } else{ + stop("incorrect input for msa!") + } + if (!is.logical(keep.singletons)) stop("Invalid keep.singletons! Must be on of TRUE/FALSE.") + + #Load sequences using ape. This does a lot of the checking for us. + seq_names <- labels(seqs) + seqs <- as.character(as.matrix(seqs)) + rownames(seqs) <- seq_names + seqs[is.na(seqs)] <- "-" + + if (nrow(seqs)<3) stop("Less than 3 sequences!") + warning("Characters not in acgtnACGTN- will be treated as missing (-)...") + + #Remove conserved columns + conserved <- colSums(t(t(seqs)==seqs[1,]))==nrow(seqs) + seqs <- seqs[, !conserved] + + if(!keep.singletons){ + #remove singletons as they are uninformative in the algorithm + is_singleton <- apply(seqs, 2, function(x){ + tab <- table(x) + return(x %in% names(tab)[tab==1]) + }) + seqs[is_singleton] <- "-" + } + + #Convert gaps and unknowns to same symbol + seqs[seqs=="n"] <- "-" + + return(seqs) +}