Added numeric output option to load_fasta() (#25)

This commit is contained in:
Waldir Leoncio 2023-08-11 11:01:20 +02:00
parent a51816d5c0
commit 95d9d658cb
2 changed files with 15 additions and 7 deletions

View file

@ -4,7 +4,7 @@
#' running the hierBAPS algorithm.
#'
#' @param msa Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered
#' @param keep.singletons A logical indicating whether to consider singleton mutations in calculating the clusters
#' @param keep_singletons A logical indicating whether to consider singleton mutations in calculating the clusters
#'
#' @return A character matrix with filtered SNP data
#'
@ -15,7 +15,7 @@
#' @seealso rhierbaps::load_fasta
#' @importFrom ape read.FASTA as.DNAbin
#' @export
load_fasta <- function(msa, keep.singletons = FALSE) {
load_fasta <- function(msa, keep_singletons = FALSE, output_numbers = TRUE) {
# Check inputs
if (is(msa, "character")) {
@ -28,7 +28,9 @@ load_fasta <- function(msa, keep.singletons = FALSE) {
} else {
stop("incorrect input for msa!")
}
if (!is.logical(keep.singletons)) stop("Invalid keep.singletons! Must be on of TRUE/FALSE.")
if (!is.logical(keep_singletons)) {
stop("Invalid keep_singletons! Must be one of TRUE/FALSE.")
}
# Load sequences using ape. This does a lot of the checking for us.
seq_names <- labels(seqs)
@ -46,8 +48,8 @@ load_fasta <- function(msa, keep.singletons = FALSE) {
conserved <- colSums(t(t(seqs) == seqs[1, ])) == nrow(seqs)
seqs <- seqs[, !conserved]
if (!keep.singletons) {
# remove singletons as they are uninformative in the algorithm
if (!keep_singletons) {
# remove_singletons as they are uninformative in the algorithm
is_singleton <- apply(seqs, 2, function(x) {
tab <- table(x)
return(x %in% names(tab)[tab == 1])
@ -58,5 +60,11 @@ load_fasta <- function(msa, keep.singletons = FALSE) {
# Convert gaps and unknowns to same symbol
seqs[seqs == "n"] <- "-"
# Replace letters with numbers, dashes with zeros
if (output_numbers) {
seqs <- matrix(match(seqs, c("a", "c", "g", "t")), nrow(seqs))
seqs[is.na(seqs)] <- 0
}
return(seqs)
}

View file

@ -4,12 +4,12 @@
\alias{load_fasta}
\title{load_fasta}
\usage{
load_fasta(msa, keep.singletons = FALSE)
load_fasta(msa, keep_singletons = FALSE, output_numbers = TRUE)
}
\arguments{
\item{msa}{Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered}
\item{keep.singletons}{A logical indicating whether to consider singleton mutations in calculating the clusters}
\item{keep_singletons}{A logical indicating whether to consider singleton mutations in calculating the clusters}
}
\value{
A character matrix with filtered SNP data