ourMELONS/R/load_fasta.R

63 lines
2 KiB
R
Raw Normal View History

2021-08-23 06:32:46 +02:00
#' load_fasta
#'
#' Loads a fasta file into matrix format ready for
#' running the hierBAPS algorithm.
#'
#' @param msa Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered
#' @param keep.singletons A logical indicating whether to consider singleton mutations in calculating the clusters
#'
#' @return A character matrix with filtered SNP data
#'
#' @examples
#' msa <- system.file("ext", "seqs.fa", package = "rBAPS")
2021-08-23 06:32:46 +02:00
#' snp.matrix <- load_fasta(msa)
2021-09-03 09:09:09 +02:00
#' @author Gerry Tonkin-Hill, Waldir Leoncio
#' @seealso rhierbaps::load_fasta
2021-08-23 14:34:20 +02:00
#' @importFrom ape read.FASTA as.DNAbin
2021-08-23 06:32:46 +02:00
#' @export
load_fasta <- function(msa, keep.singletons = FALSE) {
2021-08-23 06:32:46 +02:00
# Check inputs
if (class(msa) == "character") {
2021-08-23 06:32:46 +02:00
if (!file.exists(msa)) stop("Invalid msa or the file does not exist!")
seqs <- ape::read.FASTA(msa)
} else if (class(msa) == "matrix") {
2021-08-23 06:32:46 +02:00
seqs <- ape::as.DNAbin(msa)
} else if (class(msa) == "DNAbin") {
2021-08-23 06:32:46 +02:00
seqs <- msa
} else {
2021-08-23 06:32:46 +02:00
stop("incorrect input for msa!")
}
if (!is.logical(keep.singletons)) stop("Invalid keep.singletons! Must be on of TRUE/FALSE.")
# Load sequences using ape. This does a lot of the checking for us.
2021-08-23 06:32:46 +02:00
seq_names <- labels(seqs)
seqs <- as.character(as.matrix(seqs))
rownames(seqs) <- seq_names
seqs[is.na(seqs)] <- "-"
2021-09-03 09:09:09 +02:00
# Validation -----------------------------------------------------------------
if (nrow(seqs) < 3) stop("Less than 3 sequences!")
if (any(!(as.vector(tolower(seqs)) %in% c("a", "c", "g", "t", "n", "-")))) {
warning("Characters not in acgtnACGTN- will be treated as missing (-)...")
}
2021-08-23 06:32:46 +02:00
# Remove conserved columns
conserved <- colSums(t(t(seqs) == seqs[1, ])) == nrow(seqs)
2021-08-23 06:32:46 +02:00
seqs <- seqs[, !conserved]
if (!keep.singletons) {
# remove singletons as they are uninformative in the algorithm
is_singleton <- apply(seqs, 2, function(x) {
2021-08-23 06:32:46 +02:00
tab <- table(x)
return(x %in% names(tab)[tab == 1])
2021-08-23 06:32:46 +02:00
})
seqs[is_singleton] <- "-"
}
# Convert gaps and unknowns to same symbol
seqs[seqs == "n"] <- "-"
2021-08-23 06:32:46 +02:00
return(seqs)
}