Added numeric output option to load_fasta() (#25)

2023-08-11 11:01:20 +02:00 · 2023-08-11 11:01:20 +02:00 · 95d9d658cb
commit 95d9d658cb
parent a51816d5c0
2 changed files with 15 additions and 7 deletions
--- a/R/load_fasta.R
+++ b/R/load_fasta.R
@ -4,7 +4,7 @@
 #' running the hierBAPS algorithm.
 #'
 #' @param msa Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered
-#' @param keep.singletons A logical indicating whether to consider singleton mutations in calculating the clusters
+#' @param keep_singletons A logical indicating whether to consider singleton mutations in calculating the clusters
 #'
 #' @return A character matrix with filtered SNP data
 #'
@ -15,7 +15,7 @@
 #' @seealso rhierbaps::load_fasta
 #' @importFrom ape read.FASTA as.DNAbin
 #' @export
-load_fasta <- function(msa, keep.singletons = FALSE) {
+load_fasta <- function(msa, keep_singletons = FALSE, output_numbers = TRUE) {

  # Check inputs
  if (is(msa, "character")) {
@ -28,7 +28,9 @@ load_fasta <- function(msa, keep.singletons = FALSE) {
  } else {
    stop("incorrect input for msa!")
  }
-  if (!is.logical(keep.singletons)) stop("Invalid keep.singletons! Must be on of TRUE/FALSE.")
+  if (!is.logical(keep_singletons)) {
+    stop("Invalid keep_singletons! Must be one of TRUE/FALSE.")
+  }

  # Load sequences using ape. This does a lot of the checking for us.
  seq_names <- labels(seqs)
@ -46,8 +48,8 @@ load_fasta <- function(msa, keep.singletons = FALSE) {
  conserved <- colSums(t(t(seqs) == seqs[1, ])) == nrow(seqs)
  seqs <- seqs[, !conserved]

-  if (!keep.singletons) {
-    # remove singletons as they are uninformative in the algorithm
+  if (!keep_singletons) {
+    # remove_singletons as they are uninformative in the algorithm
    is_singleton <- apply(seqs, 2, function(x) {
      tab <- table(x)
      return(x %in% names(tab)[tab == 1])
@ -58,5 +60,11 @@ load_fasta <- function(msa, keep.singletons = FALSE) {
  # Convert gaps and unknowns to same symbol
  seqs[seqs == "n"] <- "-"

+  # Replace letters with numbers, dashes with zeros
+  if (output_numbers) {
+    seqs <- matrix(match(seqs, c("a", "c", "g", "t")), nrow(seqs))
+    seqs[is.na(seqs)] <- 0
+  }
+
  return(seqs)
 }
--- a/man/load_fasta.Rd
+++ b/man/load_fasta.Rd
@ -4,12 +4,12 @@
 \alias{load_fasta}
 \title{load_fasta}
 \usage{
-load_fasta(msa, keep.singletons = FALSE)
+load_fasta(msa, keep_singletons = FALSE, output_numbers = TRUE)
 }
 \arguments{
 \item{msa}{Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered}

-\item{keep.singletons}{A logical indicating whether to consider singleton mutations in calculating the clusters}
+\item{keep_singletons}{A logical indicating whether to consider singleton mutations in calculating the clusters}
 }
 \value{
 A character matrix with filtered SNP data