Added load_fasta from rhierbaps
This commit is contained in:
parent
8956949bff
commit
50ec8618cc
1 changed files with 56 additions and 0 deletions
56
R/load_fasta.R
Normal file
56
R/load_fasta.R
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
#' load_fasta
|
||||
#'
|
||||
#' Loads a fasta file into matrix format ready for
|
||||
#' running the hierBAPS algorithm.
|
||||
#'
|
||||
#' @param msa Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered
|
||||
#' @param keep.singletons A logical indicating whether to consider singleton mutations in calculating the clusters
|
||||
#'
|
||||
#' @return A character matrix with filtered SNP data
|
||||
#'
|
||||
#' @examples
|
||||
#' msa <- system.file("extdata", "seqs.fa", package = "rhierbaps")
|
||||
#' snp.matrix <- load_fasta(msa)
|
||||
#' @export
|
||||
load_fasta <- function(msa, keep.singletons=FALSE) {
|
||||
|
||||
#Check inputs
|
||||
if(class(msa)=="character"){
|
||||
if (!file.exists(msa)) stop("Invalid msa or the file does not exist!")
|
||||
seqs <- ape::read.FASTA(msa)
|
||||
} else if(class(msa)=="matrix"){
|
||||
seqs <- ape::as.DNAbin(msa)
|
||||
} else if(class(msa)=="DNAbin"){
|
||||
seqs <- msa
|
||||
} else{
|
||||
stop("incorrect input for msa!")
|
||||
}
|
||||
if (!is.logical(keep.singletons)) stop("Invalid keep.singletons! Must be on of TRUE/FALSE.")
|
||||
|
||||
#Load sequences using ape. This does a lot of the checking for us.
|
||||
seq_names <- labels(seqs)
|
||||
seqs <- as.character(as.matrix(seqs))
|
||||
rownames(seqs) <- seq_names
|
||||
seqs[is.na(seqs)] <- "-"
|
||||
|
||||
if (nrow(seqs)<3) stop("Less than 3 sequences!")
|
||||
warning("Characters not in acgtnACGTN- will be treated as missing (-)...")
|
||||
|
||||
#Remove conserved columns
|
||||
conserved <- colSums(t(t(seqs)==seqs[1,]))==nrow(seqs)
|
||||
seqs <- seqs[, !conserved]
|
||||
|
||||
if(!keep.singletons){
|
||||
#remove singletons as they are uninformative in the algorithm
|
||||
is_singleton <- apply(seqs, 2, function(x){
|
||||
tab <- table(x)
|
||||
return(x %in% names(tab)[tab==1])
|
||||
})
|
||||
seqs[is_singleton] <- "-"
|
||||
}
|
||||
|
||||
#Convert gaps and unknowns to same symbol
|
||||
seqs[seqs=="n"] <- "-"
|
||||
|
||||
return(seqs)
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue