Added numeric output option to load_fasta() (#25)

This commit is contained in:
Waldir Leoncio 2023-08-11 11:01:20 +02:00
parent a51816d5c0
commit 95d9d658cb
2 changed files with 15 additions and 7 deletions

View file

@ -4,7 +4,7 @@
#' running the hierBAPS algorithm. #' running the hierBAPS algorithm.
#' #'
#' @param msa Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered #' @param msa Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered
#' @param keep.singletons A logical indicating whether to consider singleton mutations in calculating the clusters #' @param keep_singletons A logical indicating whether to consider singleton mutations in calculating the clusters
#' #'
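#' @return A character matrix with filtered SNP data #' @return A matrix with filtered SNP data: numeric codes (a = 1, c = 2, g = 3, t = 4, anything else = 0) when output_numbers is TRUE, a character matrix otherwise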
#' #'
@ -15,7 +15,7 @@
#' @seealso rhierbaps::load_fasta #' @seealso rhierbaps::load_fasta
#' @importFrom ape read.FASTA as.DNAbin #' @importFrom ape read.FASTA as.DNAbin
#' @export #' @export
load_fasta <- function(msa, keep.singletons = FALSE) { load_fasta <- function(msa, keep_singletons = FALSE, output_numbers = TRUE) {
# Check inputs # Check inputs
if (is(msa, "character")) { if (is(msa, "character")) {
@ -28,7 +28,9 @@ load_fasta <- function(msa, keep.singletons = FALSE) {
} else { } else {
stop("incorrect input for msa!") stop("incorrect input for msa!")
} }
if (!is.logical(keep.singletons)) stop("Invalid keep.singletons! Must be on of TRUE/FALSE.") if (!is.logical(keep_singletons)) {
stop("Invalid keep_singletons! Must be one of TRUE/FALSE.")
}
# Load sequences using ape. This does a lot of the checking for us. # Load sequences using ape. This does a lot of the checking for us.
seq_names <- labels(seqs) seq_names <- labels(seqs)
@ -46,8 +48,8 @@ load_fasta <- function(msa, keep.singletons = FALSE) {
conserved <- colSums(t(t(seqs) == seqs[1, ])) == nrow(seqs) conserved <- colSums(t(t(seqs) == seqs[1, ])) == nrow(seqs)
seqs <- seqs[, !conserved] seqs <- seqs[, !conserved]
if (!keep.singletons) { if (!keep_singletons) {
# remove singletons as they are uninformative in the algorithm # remove singletons as they are uninformative in the algorithm
is_singleton <- apply(seqs, 2, function(x) { is_singleton <- apply(seqs, 2, function(x) {
tab <- table(x) tab <- table(x)
return(x %in% names(tab)[tab == 1]) return(x %in% names(tab)[tab == 1])
@ -58,5 +60,11 @@ load_fasta <- function(msa, keep.singletons = FALSE) {
# Convert gaps and unknowns to same symbol # Convert gaps and unknowns to same symbol
seqs[seqs == "n"] <- "-" seqs[seqs == "n"] <- "-"
# Replace letters with numbers (a = 1, c = 2, g = 3, t = 4); dashes and any other symbols become zeros
if (output_numbers) {
seqs <- matrix(match(seqs, c("a", "c", "g", "t")), nrow(seqs))
seqs[is.na(seqs)] <- 0
}
return(seqs) return(seqs)
} }

View file

@ -4,12 +4,12 @@
\alias{load_fasta} \alias{load_fasta}
\title{load_fasta} \title{load_fasta}
\usage{ \usage{
load_fasta(msa, keep.singletons = FALSE) load_fasta(msa, keep_singletons = FALSE, output_numbers = TRUE)
} }
\arguments{ \arguments{
\item{msa}{Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered} \item{msa}{Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered}
\item{keep.singletons}{A logical indicating whether to consider singleton mutations in calculating the clusters} \item{keep_singletons}{A logical indicating whether to consider singleton mutations in calculating the clusters}
} }
\value{ \value{
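A character matrix with filtered SNP data A matrix with filtered SNP data: numeric codes (a = 1, c = 2, g = 3, t = 4, anything else = 0) when output_numbers is TRUE, a character matrix otherwise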