Merge branch 'issue-16' into develop

Basic replacement. Still needs diploid development.
2021-08-23 06:42:41 +02:00 · 2021-08-23 06:42:41 +02:00 · 1a3bdd973e
commit 1a3bdd973e
parent 8956949bff cafd194880
5 changed files with 88 additions and 382 deletions
--- a/1
+++ b/1
@ -34,6 +34,7 @@ Description: Partial R implementation of the BAPS software
    computationally-efficient method for the identification of admixture events
    in genetic population history.
 License: GPL-3
+BugReports: https://github.com/ocbe-uio/rBAPS/issues
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.1.1
--- a/1
+++ b/1
@ -21,6 +21,7 @@ export(laskeMuutokset4)
 export(learn_partition_modified)
 export(learn_simple_partition)
 export(linkage)
+export(load_fasta)
 export(logml2String)
 export(lueGenePopData)
 export(lueNimi)
--- a/R/greedyMix.R
+++ b/R/greedyMix.R
@ -11,386 +11,10 @@ greedyMix <- function(
 	savePreProcessed = NULL,
 	filePreProcessed = NULL
 ) {
-	# ASK: Unclear when fixedk == TRUE. Remove?
-	# check whether fixed k mode is selected
-	# h0 <- findobj('Tag','fixk_menu')
-	# fixedK = get(h0, 'userdata');
-	fixedK <- FALSE
-
-	# if fixedK
-	# 	if ~(fixKWarning == 1) % call function fixKWarning
-	# 		return
-	# 	end
-	# end
-
-	# ASK: ditto
-	# % check whether partition compare mode is selected
-	# h1 = findobj('Tag','partitioncompare_menu');
-	# partitionCompare = get(h1, 'userdata');
-	partitionCompare <- FALSE
-
-	if (is(tietue, "list") | is(tietue, "character")) {
-		# ----------------------------------------------------------------------
-		# Defining type of file
-		# ----------------------------------------------------------------------
-		if (is.null(format)) {
-			input_type <- inputdlg(
-				paste(
-					'Specify the format of your data:\n',
-					'1) BAPS-format\n',
-					'2) GenePop-format\n',
-					'3) Preprocessed data\n'
-				)
-			)
-			# Converting from number into name
-			input_type_name <- switch(
-				input_type, 'BAPS-format', 'GenePop-format', 'Preprocessed data'
-			)
-		} else {
-			input_type_name <- paste0(format, "-format")
-		}
-		if (length(input_type_name) == 0) {
-			stop('Invalid alternative')
-		} else if (input_type_name == 'BAPS-format') {
-			# ------------------------------------------------------------------
-			# Treating BAPS-formatted files
-			# ------------------------------------------------------------------
-			if (!is(tietue, "character")) {
-				pathname_filename <- uigetfile(
-					"*.txt", "Load data in BAPS-format"
-				)
-			} else {
-				pathname_filename <- tietue
-			}
-
-			# ASK: remove?
-			# if ~isempty(partitionCompare)
-			# 	fprintf(1,'Data: %s\n',[pathname filename]);
-			# end
-
-			data <- read.delim(pathname_filename, header = FALSE, sep = " ")
-			data <- as.matrix(data)
-			ninds <- testaaOnkoKunnollinenBapsData(data)  # testing
-			if (ninds == 0) stop('Incorrect Data-file')
-
-			# ASK: remove?
-			# h0 = findobj('Tag','filename1_text');
-			# set(h0,'String',filename); clear h0;
-			message(
-				'When using data which are in BAPS-format, ',
-				'you can specify the sampling populations of the ',
-				'individuals by giving two additional files: ',
-				'one containing the names of the populations, ',
-				'the other containing the indices of the first ',
-				'individuals of the populations.'
-			)
-			input_pops <- inputdlg(
-				prompt = 'Do you wish to specify the sampling populations? [y/N]',
-				definput = 'N'
-			)
-			if (tolower(input_pops) %in% c('yes', 'y')) {
-				popfile    <- uigetfile('*.txt', 'Load population names')
-				kysyToinen <- ifelse(popfile$name == 0, 0, 1)
-				if (kysyToinen == 1) {
-					indicesfile <- uigetfile('*.txt', 'Load population indices')
-					if (indicesfile == 0) {
-						popnames = ""
-					} else {
-						# popnames = initPopNames([namepath namefile],[indicespath indicesfile]) # TODO: translate this fun
-					}
-				} else {
-					popnames <- ""
-				}
-			} else {
-				popnames <- ""
-			}
-
-			temp_handleData <- handleData(data)
-			data <- temp_handleData$newData
-			rowsFromInd <- temp_handleData$rowsFromInd
-			alleleCodes <- temp_handleData$alleleCodes
-			noalle <- temp_handleData$noalle
-			adjprior <- temp_handleData$adjprior
-			priorTerm <- temp_handleData$priorTerm
-			Z_dist <- newGetDistances(data,rowsFromInd)
-			Z <- Z_dist$Z
-			dist <- Z_dist$dist
-			rm(temp_handleData, Z_dist)
-			if (is.null(savePreProcessed)) {
-				save_preproc <- questdlg(
-					quest = 'Do you wish to save pre-processed data?',
-					dlgtitle = 'Save pre-processed data?',
-					defbtn = 'y'
-				)
-			} else {
-				save_preproc <- savePreProcessed
-			}
-			if (save_preproc %in% c('y', 'yes', TRUE)) {
-				file_out      <- uiputfile('.rda','Save pre-processed data as')
-				kokonimi      <- paste0(file_out$path, file_out$name)
-				c             <- list()
-				c$data        <- data
-				c$rowsFromInd <- rowsFromInd
-				c$alleleCodes <- alleleCodes
-				c$noalle      <- noalle
-				c$adjprior    <- adjprior
-				c$priorTerm   <- priorTerm
-				c$dist        <- dist
-				c$popnames    <- popnames
-				c$Z           <- Z
-				save(c, file = kokonimi)
-				rm(c)
-			}
-		} else if (input_type_name == 'GenePop-format') {
-			# ------------------------------------------------------------------
-			# Treating GenePop-formatted files
-			# ------------------------------------------------------------------
-			if (!is(tietue, "character")) {
-				filename_pathname <- uigetfile(
-					filter = '*.txt',
-					title = 'Load data in GenePop-format'
-				)
-				if (filename_pathname$name == 0) stop("No name provided")
-			} else {
-				filename_pathname <- tietue
-			}
-
-			# ASK: remove?
-			# if ~isempty(partitionCompare)
-			# 	fprintf(1,'Data: %s\n',[pathname filename]);
-			# end
-
-			kunnossa <- testaaGenePopData(filename_pathname)
-			if (kunnossa == 0) stop("testaaGenePopData returned 0")
-			data_popnames <- lueGenePopData(filename_pathname)
-			data          <- data_popnames$data
-			popnames      <- data_popnames$popnames
-
-# 			h0 = findobj('Tag','filename1_text');
-# 			set(h0,'String',filename); clear h0;
-
-			list_dranap <- handleData(data)
-			data        <- list_dranap$newData
-			rowsFromInd <- list_dranap$rowsFromInd
-			alleleCodes <- list_dranap$alleleCodes
-			noalle      <- list_dranap$noalle
-			adjprior    <- list_dranap$adjprior
-			priorTerm   <- list_dranap$prioterm
-
-			list_Zd <- newGetDistances(data,rowsFromInd) # FIXME: debug
-			Z       <- list_Zd$Z
-			dist    <- list_Zd$dist
-
-			if (is.null(savePreProcessed)) {
-				save_preproc <- questdlg(
-					quest = 'Do you wish to save pre-processed data?',
-					dlgtitle = 'Save pre-processed data?',
-					defbtn = 'y'
-				)
-			} else {
-				save_preproc <- savePreProcessed
-			}
-			if (save_preproc %in% c('y', 'Yes', TRUE)) {
-				file_out      <- uiputfile('.rda','Save pre-processed data as')
-				kokonimi      <- paste0(file_out$path, file_out$name)
-				# FIXME: translate functions above so the objects below exist
-				c$data        <- data
-				c$rowsFromInd <- rowsFromInd
-				c$alleleCodes <- alleleCodes
-				c$noalle      <- noalle
-				c$adjprior    <- adjprior
-				c$priorTerm   <- priorTerm
-				c$dist        <- dist
-				c$popnames    <- popnames
-				c$Z           <- Z
-				save(c, file = kokonimi)
-				rm(c)
-			}
-		} else if (input_type_name == 'Preprocessed data') {
-			# ------------------------------------------------------------------
-			# Handling Pre-processed data
-			# ------------------------------------------------------------------
-			file_in <- uigetfile(
-				filter = '*.txt',
-				title = 'Load pre-processed data in GenePop-format'
-			)
-			if (file_in$name == 0) stop("No name provided")
-
-			# ASK: remove?
-			# h0 = findobj('Tag','filename1_text');
-			# set(h0,'String',filename); clear h0;
-			# if ~isempty(partitionCompare)
-			# 	fprintf(1,'Data: %s\n',[pathname filename]);
-			# end
-
-			struct_array <- readRDS(paste0(file_in$path, file_in$name))
-			if (isfield(struct_array,'c')) { # Matlab versio
-				c <- struct_array$c
-				if (!isfield(c,'dist')) stop('Incorrect file format')
-			} else if (isfield(struct_array,'dist')) { #Mideva versio
-				c <- struct_array
-			} else {
-				stop('Incorrect file format')
-			}
-			data        <- double(c$data)
-			rowsFromInd <- c$rowsFromInd
-			alleleCodes <- c$alleleCodes
-			noalle      <- c$noalle
-			adjprior    <- c$adjprior
-			priorTerm   <- c$priorTerm
-			dist        <- c$dist
-			popnames    <- c$popnames
-			Z           <- c$Z
-			rm(c)
-		}
-	} else {
-		data        <- double(tietue$data)
-		rowsFromInd <- tietue$rowsFromInd
-		alleleCodes <- tietue$alleleCodes
-		noalle      <- tietue$noalle
-		adjprior    <- tietue$adjprior
-		priorTerm   <- tietue$priorTerm
-		dist        <- tietue$dist
-		popnames    <- tietue$popnames
-		Z           <- tietue$Z
-		rm(tietue)
-	}
-
-	# ==========================================================================
-	# Declaring global variables and changing environment of children functions
-	# ==========================================================================
-	PARTITION       <- vector()
-	COUNTS          <- vector()
-	SUMCOUNTS       <- vector()
-	POP_LOGML       <- vector()
-	clearGlobalVars()
-
-	environment(writeMixtureInfo) <- environment()
-	# ==========================================================================
-	c             <- list()
-	c$data        <- data
-	c$noalle      <- noalle
-	c$adjprior    <- adjprior
-	c$priorTerm   <- priorTerm
-	c$dist        <- dist
-	c$Z           <- Z
-	c$rowsFromInd <- rowsFromInd
-
-	ninds  <- length(unique(data[, ncol(data)]))
-	ekat   <- t(seq(1, ninds, rowsFromInd) * rowsFromInd)
-	c$rows <- t(rbind(ekat, ekat + rowsFromInd - 1))
-
-	# ASK remove?
-	# partition compare
-	# if (!is.null(partitionCompare)) {
-	# 	nsamplingunits <- size(c$rows, 1)
-	# 	partitions     <- partitionCompare$partitions
-	# 	npartitions    <- size(partitions, 2)
-	# 	partitionLogml <- zeros(1, npartitions)
-	# 	for (i in seq_len(npartitions)) {
-	# 		# number of unique partition lables
-	# 		npops <- length(unique(partitions[, i]))
-
-	# 		partitionInd <- zeros(ninds * rowsFromInd, 1)
-	# 		partitionSample <- partitions[, i]
-	# 		for (j in seq_len(nsamplingunits)) {
-	# 			partitionInd[c$rows[j, 1]:c$rows[j, 2]] <- partitionSample[j]
-	# 		}
-	# 		# partitionLogml[i] = initialCounts(
-	# 		# 	partitionInd,
-	# 		# 	data[, seq_len(end - 1)],
-	# 		# 	npops,
-	# 		# 	c$rows,
-	# 		# 	noalle,
-	# 		# 	adjprior
-	# 		# ) #TODO translate
-	# 	}
-	# 	# return the logml result
-	# 	partitionCompare$logmls <- partitionLogml
-	# 	# set(h1, 'userdata', partitionCompare)
-	# 	return()
-	# }
-
-	if (fixedK) {
-		# logml_npops_partitionSummary <- indMix_fixK(c) # TODO: translate
-		# logml <- logml_npops_partitionSummary$logml
-		# npops <- logml_npops_partitionSummary$npops
-		# partitionSummary <- logml_npops_partitionSummary$partitionSummary
-	} else {
-		logml_npops_partitionSummary <- indMix(c)
-		logml <- logml_npops_partitionSummary$logml
-		npops <- logml_npops_partitionSummary$npops
-		partitionSummary <- logml_npops_partitionSummary$partitionSummary
-	}
-	if (logml == 1) {
-		warning("logml == 1")
-		return()
-	}
-
-	data <- data[, seq_len(ncol(data) - 1)]
-
-	# ASK: remove?
-	# h0 = findobj('Tag','filename1_text')
-	# inp = get(h0,'String');
-	# h0 = findobj('Tag','filename2_text')
-	# outp = get(h0,'String');
-	inp <- vector()
-	outp <- vector()
-
-	changesInLogml <- writeMixtureInfo(
-		logml, rowsFromInd, data, adjprior, priorTerm, outp, inp,
-		popnames, fixedK
-	) # FIXME: broken
-
-	# viewMixPartition(PARTITION, popnames) # ASK translate? On graph folder
-
-	talle <- questdlg(
-		quest = paste(
-			'Do you want to save the mixture populations',
-			'so that you can use them later in admixture analysis?'
-		),
-		dlgtitle = 'Save results?',
-		defbtn = 'y'
-	)
-	if (talle %in% c('Yes', 'y')) {
-		filename_pathname <- uiputfile('.mat','Save results as')
-
-		# ======================================================================
-		cond <- (sum(filename_pathname$name) == 0) |
-		(sum(filename_pathname$path) == 0)
-		if (cond) {
-			# Cancel was pressed
-			return()
-		} else {
-			# copy 'baps4_output.baps' into the text file with the same name.
-			if (file.exists('baps4_output.baps')) {
-				file.copy(
-					from = 'baps4_output.baps',
-					to = paste0(
-						filename_pathname$path, filename_pathname$name, '.txt'
-					)
-				)
-				file.remove('baps4_output.baps')
-			}
-		}
-		# ======================================================================
-
-		c$PARTITION      <- PARTITION
-		c$COUNTS         <- COUNTS
-		c$SUMCOUNTS      <- SUMCOUNTS
-		c$alleleCodes    <- alleleCodes
-		c$adjprior       <- adjprior
-		c$popnames       <- popnames
-		c$rowsFromInd    <- rowsFromInd
-		c$data           <- data
-		c$npops          <- npops
-		c$noalle         <- noalle
-		c$mixtureType    <- 'mix'
-		c$logml          <- logml
-		c$changesInLogml <- changesInLogml
-
-		save(c, file = paste0(filename_pathname$path, filename_pathname$name))
-	} else {
-		if (file.exists('baps4_output.baps')) file.remove('baps4_output.baps')
-	}
+	stop(
+		"greedyMix() has been superseded by load_fasta().",
+		" Please change your code to use the latter instead of the former.",
+		" If you believe the error is internal to rBAPS, please open",
+		" a new issue (link on the package DESCRIPTION file)."
+		)
 }
--- a/R/load_fasta.R
+++ b/R/load_fasta.R
@ -0,0 +1,56 @@
+#' load_fasta
+#'
+#' Loads a fasta file into matrix format ready for
+#' running the hierBAPS algorithm.
+#'
+#' @param msa Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered
+#' @param keep.singletons A logical indicating whether to consider singleton mutations in calculating the clusters
+#'
+#' @return A character matrix with filtered SNP data
+#'
+#' @examples
+#' msa <- system.file("extdata", "seqs.fa", package = "rhierbaps")
+#' snp.matrix <- load_fasta(msa)
+#' @export
+load_fasta <- function(msa, keep.singletons=FALSE) {
+
+  #Check inputs
+  if(class(msa)=="character"){
+    if (!file.exists(msa)) stop("Invalid msa or the file does not exist!")
+    seqs <- ape::read.FASTA(msa)
+  } else if(class(msa)=="matrix"){
+    seqs <- ape::as.DNAbin(msa)
+  } else if(class(msa)=="DNAbin"){
+    seqs <- msa
+  } else{
+    stop("incorrect input for msa!")
+  }
+  if (!is.logical(keep.singletons)) stop("Invalid keep.singletons! Must be on of TRUE/FALSE.")
+
+  #Load sequences using ape. This does a lot of the checking for us.
+  seq_names <- labels(seqs)
+  seqs <- as.character(as.matrix(seqs))
+  rownames(seqs) <- seq_names
+  seqs[is.na(seqs)] <- "-"
+
+  if (nrow(seqs)<3) stop("Less than 3 sequences!")
+  warning("Characters not in acgtnACGTN- will be treated as missing (-)...")
+
+  #Remove conserved columns
+  conserved <- colSums(t(t(seqs)==seqs[1,]))==nrow(seqs)
+  seqs <- seqs[, !conserved]
+
+  if(!keep.singletons){
+    #remove singletons as they are uninformative in the algorithm
+    is_singleton <- apply(seqs, 2, function(x){
+      tab <- table(x)
+      return(x %in% names(tab)[tab==1])
+    })
+    seqs[is_singleton] <- "-"
+  }
+
+  #Convert gaps and unknowns to same symbol
+  seqs[seqs=="n"] <- "-"
+
+  return(seqs)
+}
--- a/man/load_fasta.Rd
+++ b/man/load_fasta.Rd
@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/load_fasta.R
+\name{load_fasta}
+\alias{load_fasta}
+\title{load_fasta}
+\usage{
+load_fasta(msa, keep.singletons = FALSE)
+}
+\arguments{
+\item{msa}{Either the location of a fasta file or ape DNAbin object containing the multiple sequence alignment data to be clustered}
+
+\item{keep.singletons}{A logical indicating whether to consider singleton mutations in calculating the clusters}
+}
+\value{
+A character matrix with filtered SNP data
+}
+\description{
+Loads a fasta file into matrix format ready for
+running the hierBAPS algorithm.
+}
+\examples{
+msa <- system.file("extdata", "seqs.fa", package = "rhierbaps")
+snp.matrix <- load_fasta(msa)
+}