From 036ec00bbdbf929d40f6261a3a5b8d70870ab8d0 Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Fri, 13 Sep 2024 13:39:24 +0200 Subject: [PATCH 1/4] Fixed `addAlleles()` --- R/addAlleles.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/addAlleles.R b/R/addAlleles.R index 9854027..451db4e 100644 --- a/R/addAlleles.R +++ b/R/addAlleles.R @@ -11,7 +11,7 @@ addAlleles <- function(data, ind, line, divider) { # line. Jos data on 3 digit formaatissa on divider=1000. # Jos data on 2 digit formaatissa on divider=100. - nloci <- size(data, 2) # added 1 from original code + nloci <- size(data, 2) - 1L if (size(data, 1) < (2 * ind)) { data <- rbind(data, zeros(100, nloci)) # subtracted 1 from original code } @@ -22,8 +22,7 @@ addAlleles <- function(data, ind, line, divider) { k <- k + 1 merkki <- substring(line, k, k) } - line <- substring(line, k + 1) - # clear k; clear merkki; + line <- trimws(substring(line, k + 1)) if (grepl(" ", line)) { alleeliTaulu <- as.numeric(strsplit(line, split = " ")[[1]]) From 00605016d60a985ed9cbb9471d71ea82ee8a0926 Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Fri, 13 Sep 2024 13:51:25 +0200 Subject: [PATCH 2/4] Added `process_GenePop_data()` --- R/process_GenePop_data.R | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 R/process_GenePop_data.R diff --git a/R/process_GenePop_data.R b/R/process_GenePop_data.R new file mode 100644 index 0000000..e444c47 --- /dev/null +++ b/R/process_GenePop_data.R @@ -0,0 +1,24 @@ +process_GenePop_data <- function(filename) { + # Pre-processing GenePop data + kunnossa <- testaaGenePopData(filename) + data_popnames <- lueGenePopData(filename) + data <- data_popnames$data + popnames <- data_popnames$popnames + + # Processing GenePop data + c <- handleData(data, "genepop") + Z_dist <- newGetDistances(c$newData, c$rowsFromInd) + + # Forming and returning pre-processed data + list( + "data" = c$newData, + "rowsFromInd" = c$rowsFromInd, + "alleleCodes" = c$alleleCodes, + "noalle" = c$noalle, + "adjprior" = c$adjprior, + "priorTerm" = c$priorTerm, + "dist" = Z_dist$dist, + "popnames" = popnames, + "Z" = Z_dist$Z + ) +} From 31575a5b9e400626e41765cf823a5a1c28d331dd Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Fri, 13 Sep 2024 13:52:39 +0200 Subject: [PATCH 3/4] Dropped support for GenePop on `importFile()` File should be handled by `process_GenePop_data()`, as per original BAPS. --- R/importFile.R | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/R/importFile.R b/R/importFile.R index 2ed9c75..57f2fee 100644 --- a/R/importFile.R +++ b/R/importFile.R @@ -1,6 +1,5 @@ #' @title Import data file -#' @description Imports data from several formats (FASTA, VCF, SAM, BAM, -#' Genepop). +#' @description Imports data from several formats (FASTA, VCF, SAM, BAM). #' @param data raw dataset #' @param format data format (guesses from extension if not provided) #' @param verbose if \code{TRUE}, prints extra output information @@ -33,15 +32,6 @@ importFile <- function(data, format, verbose) { ) } else if (format == "bam") { out <- Rsamtools::scanBam(data) - } else if (format == "genepop") { - if (toupper(adegenet::.readExt(data)) == "TXT") { - message("Creating a copy of the file with the .gen extension") - dataGen <- gsub("txt", "gen", data) - file.copy(data, dataGen) - out <- adegenet::read.genepop(dataGen) - } else { - out <- adegenet::read.genepop(data) - } } else { stop("Format not supported.") } From 9e554d5d4bc12d348a30537ae082bd0b2896db8e Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Fri, 13 Sep 2024 13:53:01 +0200 Subject: [PATCH 4/4] Aligned processing of Genepop with MATLAB code --- R/greedyMix.R | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/R/greedyMix.R b/R/greedyMix.R index dd0d117..cea602b 100644 --- a/R/greedyMix.R +++ b/R/greedyMix.R @@ -27,6 +27,7 @@ greedyMix <- function( inp = NULL, popnames = NULL, fixedK = FALSE, verbose = FALSE ) { # Importing and handling data ================================================ + # TODO: use format as class and make handling data a generic if (tolower(format) %in% "fasta") { data <- convert_FASTA_to_BAPS(data) format <- "baps" @@ -42,6 +43,17 @@ greedyMix <- function( Z = data[["Z"]], dist = data[["dist"]] ) + } else if (tolower(format) %in% "genepop") { + data <- process_GenePop_data(data) + c <- list( + noalle = data[["noalle"]], + data = data[["data"]], + adjprior = data[["adjprior"]], + priorTerm = data[["priorTerm"]], + rowsFromInd = data[["rowsFromInd"]], + Z = data[["Z"]], + dist = data[["dist"]] + ) } else { data <- importFile(data, format, verbose) data <- handleData(data, tolower(format)) @@ -68,7 +80,7 @@ greedyMix <- function( # Generating partition summary =============================================== ekat <- seq(1L, ninds * c[["rowsFromInd"]], c[["rowsFromInd"]]) c[["rows"]] <- cbind(ekat, ekat + c[["rowsFromInd"]] - 1L) - logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose) # FIXME: not working for FASTA data + logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose) # FIXME: not working for FASTA, GenePop logml <- logml_npops_partitionSummary[["logml"]] npops <- logml_npops_partitionSummary[["npops"]] partitionSummary <- logml_npops_partitionSummary[["partitionSummary"]]