2021-08-23 06:32:46 +02:00
|
|
|
#' load_fasta
|
|
|
|
|
#'
|
|
|
|
|
#' Loads a fasta file into matrix format ready for
|
|
|
|
|
#' running the hierBAPS algorithm.
|
|
|
|
|
#'
|
|
|
|
|
#' @param msa Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered
|
2023-08-11 11:01:20 +02:00
|
|
|
#' @param keep_singletons A logical indicating whether to consider singleton mutations in calculating the clusters
|
2021-08-23 06:32:46 +02:00
|
|
|
#'
|
|
|
|
|
#' @return A character matrix with filtered SNP data
|
|
|
|
|
#'
|
|
|
|
|
#' @examples
|
2023-08-09 10:54:48 +02:00
|
|
|
#' msa <- system.file("extdata", "seqs.fa", package = "rBAPS")
|
2021-08-23 06:32:46 +02:00
|
|
|
#' snp.matrix <- load_fasta(msa)
|
2021-09-03 09:09:09 +02:00
|
|
|
#' @author Gerry Tonkin-Hill, Waldir Leoncio
|
2021-08-23 06:45:51 +02:00
|
|
|
#' @seealso rhierbaps::load_fasta
|
2021-08-23 14:34:20 +02:00
|
|
|
#' @importFrom ape read.FASTA as.DNAbin
|
2021-08-23 06:32:46 +02:00
|
|
|
#' @export
|
2023-08-11 11:01:20 +02:00
|
|
|
load_fasta <- function(msa, keep_singletons = FALSE, output_numbers = TRUE) {
|
2021-08-23 06:32:46 +02:00
|
|
|
|
2021-11-10 14:02:35 +01:00
|
|
|
# Check inputs
|
2022-05-13 11:22:18 +02:00
|
|
|
if (is(msa, "character")) {
|
2021-08-23 06:32:46 +02:00
|
|
|
if (!file.exists(msa)) stop("Invalid msa or the file does not exist!")
|
|
|
|
|
seqs <- ape::read.FASTA(msa)
|
2022-05-13 11:22:18 +02:00
|
|
|
} else if (is(msa, "matrix")) {
|
2021-08-23 06:32:46 +02:00
|
|
|
seqs <- ape::as.DNAbin(msa)
|
2022-05-13 11:22:18 +02:00
|
|
|
} else if (is(msa, "DNAbin")) {
|
2021-08-23 06:32:46 +02:00
|
|
|
seqs <- msa
|
2021-11-10 14:02:35 +01:00
|
|
|
} else {
|
2021-08-23 06:32:46 +02:00
|
|
|
stop("incorrect input for msa!")
|
|
|
|
|
}
|
2023-08-11 11:01:20 +02:00
|
|
|
if (!is.logical(keep_singletons)) {
|
|
|
|
|
stop("Invalid keep_singletons! Must be one of TRUE/FALSE.")
|
|
|
|
|
}
|
2021-08-23 06:32:46 +02:00
|
|
|
|
2021-11-10 14:02:35 +01:00
|
|
|
# Load sequences using ape. This does a lot of the checking for us.
|
2021-08-23 06:32:46 +02:00
|
|
|
seq_names <- labels(seqs)
|
|
|
|
|
seqs <- as.character(as.matrix(seqs))
|
|
|
|
|
rownames(seqs) <- seq_names
|
|
|
|
|
seqs[is.na(seqs)] <- "-"
|
|
|
|
|
|
2021-09-03 09:09:09 +02:00
|
|
|
# Validation -----------------------------------------------------------------
|
|
|
|
|
if (nrow(seqs) < 3) stop("Less than 3 sequences!")
|
|
|
|
|
if (any(!(as.vector(tolower(seqs)) %in% c("a", "c", "g", "t", "n", "-")))) {
|
|
|
|
|
warning("Characters not in acgtnACGTN- will be treated as missing (-)...")
|
|
|
|
|
}
|
2021-08-23 06:32:46 +02:00
|
|
|
|
2021-11-10 14:02:35 +01:00
|
|
|
# Remove conserved columns
|
|
|
|
|
conserved <- colSums(t(t(seqs) == seqs[1, ])) == nrow(seqs)
|
2021-08-23 06:32:46 +02:00
|
|
|
seqs <- seqs[, !conserved]
|
|
|
|
|
|
2023-08-11 11:01:20 +02:00
|
|
|
if (!keep_singletons) {
|
|
|
|
|
# remove_singletons as they are uninformative in the algorithm
|
2021-11-10 14:02:35 +01:00
|
|
|
is_singleton <- apply(seqs, 2, function(x) {
|
2021-08-23 06:32:46 +02:00
|
|
|
tab <- table(x)
|
2021-11-10 14:02:35 +01:00
|
|
|
return(x %in% names(tab)[tab == 1])
|
2021-08-23 06:32:46 +02:00
|
|
|
})
|
|
|
|
|
seqs[is_singleton] <- "-"
|
|
|
|
|
}
|
|
|
|
|
|
2021-11-10 14:02:35 +01:00
|
|
|
# Convert gaps and unknowns to same symbol
|
|
|
|
|
seqs[seqs == "n"] <- "-"
|
2021-08-23 06:32:46 +02:00
|
|
|
|
2023-08-11 11:01:20 +02:00
|
|
|
# Replace letters with numbers, dashes with zeros
|
|
|
|
|
if (output_numbers) {
|
|
|
|
|
seqs <- matrix(match(seqs, c("a", "c", "g", "t")), nrow(seqs))
|
|
|
|
|
seqs[is.na(seqs)] <- 0
|
|
|
|
|
}
|
|
|
|
|
|
2021-08-23 06:32:46 +02:00
|
|
|
return(seqs)
|
|
|
|
|
}
|