#' load_fasta #' #' Loads a fasta file into matrix format ready for #' running the hierBAPS algorithm. #' #' @param msa Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered #' @param keep.singletons A logical indicating whether to consider singleton mutations in calculating the clusters #' #' @return A character matrix with filtered SNP data #' #' @examples #' msa <- system.file("ext", "seqs.fa", package = "rBAPS") #' snp.matrix <- load_fasta(msa) #' @author Gerry Tonkin-Hill, Waldir Leoncio #' @seealso rhierbaps::load_fasta #' @importFrom ape read.FASTA as.DNAbin #' @export load_fasta <- function(msa, keep.singletons = FALSE) { # Check inputs if (class(msa) == "character") { if (!file.exists(msa)) stop("Invalid msa or the file does not exist!") seqs <- ape::read.FASTA(msa) } else if (class(msa) == "matrix") { seqs <- ape::as.DNAbin(msa) } else if (class(msa) == "DNAbin") { seqs <- msa } else { stop("incorrect input for msa!") } if (!is.logical(keep.singletons)) stop("Invalid keep.singletons! Must be on of TRUE/FALSE.") # Load sequences using ape. This does a lot of the checking for us. seq_names <- labels(seqs) seqs <- as.character(as.matrix(seqs)) rownames(seqs) <- seq_names seqs[is.na(seqs)] <- "-" # Validation ----------------------------------------------------------------- if (nrow(seqs) < 3) stop("Less than 3 sequences!") if (any(!(as.vector(tolower(seqs)) %in% c("a", "c", "g", "t", "n", "-")))) { warning("Characters not in acgtnACGTN- will be treated as missing (-)...") } # Remove conserved columns conserved <- colSums(t(t(seqs) == seqs[1, ])) == nrow(seqs) seqs <- seqs[, !conserved] if (!keep.singletons) { # remove singletons as they are uninformative in the algorithm is_singleton <- apply(seqs, 2, function(x) { tab <- table(x) return(x %in% names(tab)[tab == 1]) }) seqs[is_singleton] <- "-" } # Convert gaps and unknowns to same symbol seqs[seqs == "n"] <- "-" return(seqs) }