From 95d9d658cb04fb54a2ceb3c039dc2d722a4acbb9 Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Fri, 11 Aug 2023 11:01:20 +0200 Subject: [PATCH] Added numeric output option to load_fasta() (#25) --- R/load_fasta.R | 18 +++++++++++++----- man/load_fasta.Rd | 4 ++-- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/R/load_fasta.R b/R/load_fasta.R index 8007471..44743de 100644 --- a/R/load_fasta.R +++ b/R/load_fasta.R @@ -4,7 +4,7 @@ #' running the hierBAPS algorithm. #' #' @param msa Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered -#' @param keep.singletons A logical indicating whether to consider singleton mutations in calculating the clusters +#' @param keep_singletons A logical indicating whether to consider singleton mutations in calculating the clusters #' #' @return A character matrix with filtered SNP data #' @@ -15,7 +15,7 @@ #' @seealso rhierbaps::load_fasta #' @importFrom ape read.FASTA as.DNAbin #' @export -load_fasta <- function(msa, keep.singletons = FALSE) { +load_fasta <- function(msa, keep_singletons = FALSE, output_numbers = TRUE) { # Check inputs if (is(msa, "character")) { @@ -28,7 +28,9 @@ load_fasta <- function(msa, keep.singletons = FALSE) { } else { stop("incorrect input for msa!") } - if (!is.logical(keep.singletons)) stop("Invalid keep.singletons! Must be on of TRUE/FALSE.") + if (!is.logical(keep_singletons)) { + stop("Invalid keep_singletons! Must be one of TRUE/FALSE.") + } # Load sequences using ape. This does a lot of the checking for us. seq_names <- labels(seqs) @@ -46,8 +48,8 @@ load_fasta <- function(msa, keep.singletons = FALSE) { conserved <- colSums(t(t(seqs) == seqs[1, ])) == nrow(seqs) seqs <- seqs[, !conserved] - if (!keep.singletons) { - # remove singletons as they are uninformative in the algorithm + if (!keep_singletons) { + # remove_singletons as they are uninformative in the algorithm is_singleton <- apply(seqs, 2, function(x) { tab <- table(x) return(x %in% names(tab)[tab == 1]) @@ -58,5 +60,11 @@ load_fasta <- function(msa, keep.singletons = FALSE) { # Convert gaps and unknowns to same symbol seqs[seqs == "n"] <- "-" + # Replace letters with numbers, dashes with zeros + if (output_numbers) { + seqs <- matrix(match(seqs, c("a", "c", "g", "t")), nrow(seqs)) + seqs[is.na(seqs)] <- 0 + } + return(seqs) } diff --git a/man/load_fasta.Rd b/man/load_fasta.Rd index 4f47322..58285b2 100644 --- a/man/load_fasta.Rd +++ b/man/load_fasta.Rd @@ -4,12 +4,12 @@ \alias{load_fasta} \title{load_fasta} \usage{ -load_fasta(msa, keep.singletons = FALSE) +load_fasta(msa, keep_singletons = FALSE, output_numbers = TRUE) } \arguments{ \item{msa}{Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered} -\item{keep.singletons}{A logical indicating whether to consider singleton mutations in calculating the clusters} +\item{keep_singletons}{A logical indicating whether to consider singleton mutations in calculating the clusters} } \value{ A character matrix with filtered SNP data