From 12884cfcb6d7d61d40b0255c59f64aa214b1317d Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Mon, 26 Aug 2024 13:23:03 +0200 Subject: [PATCH 1/9] Added GenePop data, txt and gen (#24) --- inst/extdata/GenePop.gen | 12 ++++++++++++ inst/extdata/GenePop.txt | 12 ++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 inst/extdata/GenePop.gen create mode 100644 inst/extdata/GenePop.txt diff --git a/inst/extdata/GenePop.gen b/inst/extdata/GenePop.gen new file mode 100644 index 0000000..9e1874b --- /dev/null +++ b/inst/extdata/GenePop.gen @@ -0,0 +1,12 @@ +Example GenePop Data +Loc1 +Loc2 +Loc3 +Pop +Ind1, 0101 0202 0303 +Ind2, 0102 0201 0303 +Ind3, 0101 0202 0303 +Pop +Ind4, 0101 0202 0303 +Ind5, 0102 0201 0303 +Ind6, 0101 0202 0303 diff --git a/inst/extdata/GenePop.txt b/inst/extdata/GenePop.txt new file mode 100644 index 0000000..9e1874b --- /dev/null +++ b/inst/extdata/GenePop.txt @@ -0,0 +1,12 @@ +Example GenePop Data +Loc1 +Loc2 +Loc3 +Pop +Ind1, 0101 0202 0303 +Ind2, 0102 0201 0303 +Ind3, 0101 0202 0303 +Pop +Ind4, 0101 0202 0303 +Ind5, 0102 0201 0303 +Ind6, 0101 0202 0303 From 54b0b081ea4c11b7c7fb2c17f1b5f9a75f19ec63 Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Mon, 26 Aug 2024 13:26:52 +0200 Subject: [PATCH 2/9] Added unit tests for `greedyMix()` on GenePop (#24) --- tests/testthat/test-greedyMix.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/testthat/test-greedyMix.R b/tests/testthat/test-greedyMix.R index 9353b0a..864223a 100644 --- a/tests/testthat/test-greedyMix.R +++ b/tests/testthat/test-greedyMix.R @@ -66,8 +66,10 @@ test_that("greedyMix() fails successfully", { test_that("greedyMix() works when it should", { baps_file <- file.path(path_inst, "BAPS_clustering_diploid.txt") + genepop_file <- file.path(path_inst, "GenePop.txt") fasta_file <- file.path(path_inst, "FASTA_clustering_haploid.fasta") greedy_baps <- greedyMix(baps_file, "BAPS") + greedy_genepop <- greedyMix(genepop_file, "GenePop") # FIXME: doesn't match MATLAB output expect_type(greedy_baps, "list") expect_length(greedy_baps, 10L) }) From 9e3ea49398f7facc972a167531a29f01ba89bc3c Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Fri, 13 Sep 2024 12:42:46 +0200 Subject: [PATCH 3/9] handleData defaults to BAPS format --- R/handleData.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/handleData.R b/R/handleData.R index fbdd9c9..d8d44eb 100644 --- a/R/handleData.R +++ b/R/handleData.R @@ -10,7 +10,7 @@ #' the function changes the allele codes so that one locus j #' codes get values between? 1, ..., noalle(j). #' @export -handleData <- function(raw_data, format = "Genepop") { +handleData <- function(raw_data, format = "baps") { # Alkuper?isen datan viimeinen sarake kertoo, milt?yksil?lt? # kyseinen rivi on per?isin. Funktio tutkii ensin, ett?montako # rivi?maksimissaan on per?isin yhdelt?yksil?lt? jolloin saadaan From 036ec00bbdbf929d40f6261a3a5b8d70870ab8d0 Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Fri, 13 Sep 2024 13:39:24 +0200 Subject: [PATCH 4/9] Fixed `addAlleles()` --- R/addAlleles.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/addAlleles.R b/R/addAlleles.R index 9854027..451db4e 100644 --- a/R/addAlleles.R +++ b/R/addAlleles.R @@ -11,7 +11,7 @@ addAlleles <- function(data, ind, line, divider) { # line. Jos data on 3 digit formaatissa on divider=1000. # Jos data on 2 digit formaatissa on divider=100. - nloci <- size(data, 2) # added 1 from original code + nloci <- size(data, 2) - 1L if (size(data, 1) < (2 * ind)) { data <- rbind(data, zeros(100, nloci)) # subtracted 1 from original code } @@ -22,8 +22,7 @@ addAlleles <- function(data, ind, line, divider) { k <- k + 1 merkki <- substring(line, k, k) } - line <- substring(line, k + 1) - # clear k; clear merkki; + line <- trimws(substring(line, k + 1)) if (grepl(" ", line)) { alleeliTaulu <- as.numeric(strsplit(line, split = " ")[[1]]) From 00605016d60a985ed9cbb9471d71ea82ee8a0926 Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Fri, 13 Sep 2024 13:51:25 +0200 Subject: [PATCH 5/9] Added `process_GenePop_data()` --- R/process_GenePop_data.R | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 R/process_GenePop_data.R diff --git a/R/process_GenePop_data.R b/R/process_GenePop_data.R new file mode 100644 index 0000000..e444c47 --- /dev/null +++ b/R/process_GenePop_data.R @@ -0,0 +1,24 @@ +process_GenePop_data <- function(filename) { + # Pre-processing GenePop data + kunnossa <- testaaGenePopData(filename) + data_popnames <- lueGenePopData(filename) + data <- data_popnames$data + popnames <- data_popnames$popnames + + # Processing GenePop data + c <- handleData(data, "genepop") + Z_dist <- newGetDistances(c$newData, c$rowsFromInd) + + # Forming and returning pre-processed data + list( + "data" = c$newData, + "rowsFromInd" = c$rowsFromInd, + "alleleCodes" = c$alleleCodes, + "noalle" = c$noalle, + "adjprior" = c$adjprior, + "priorTerm" = c$priorTerm, + "dist" = Z_dist$dist, + "popnames" = popnames, + "Z" = Z_dist$Z + ) +} From 31575a5b9e400626e41765cf823a5a1c28d331dd Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Fri, 13 Sep 2024 13:52:39 +0200 Subject: [PATCH 6/9] Dropped support for GenePop on `importFile()` File should be handled by `process_GenePop_data()`, as per original BAPS. --- R/importFile.R | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/R/importFile.R b/R/importFile.R index 2ed9c75..57f2fee 100644 --- a/R/importFile.R +++ b/R/importFile.R @@ -1,6 +1,5 @@ #' @title Import data file -#' @description Imports data from several formats (FASTA, VCF, SAM, BAM, -#' Genepop). +#' @description Imports data from several formats (FASTA, VCF, SAM, BAM). #' @param data raw dataset #' @param format data format (guesses from extension if not provided) #' @param verbose if \code{TRUE}, prints extra output information @@ -33,15 +32,6 @@ importFile <- function(data, format, verbose) { ) } else if (format == "bam") { out <- Rsamtools::scanBam(data) - } else if (format == "genepop") { - if (toupper(adegenet::.readExt(data)) == "TXT") { - message("Creating a copy of the file with the .gen extension") - dataGen <- gsub("txt", "gen", data) - file.copy(data, dataGen) - out <- adegenet::read.genepop(dataGen) - } else { - out <- adegenet::read.genepop(data) - } } else { stop("Format not supported.") } From 9e554d5d4bc12d348a30537ae082bd0b2896db8e Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Fri, 13 Sep 2024 13:53:01 +0200 Subject: [PATCH 7/9] Aligned processing of Genepop with MATLAB code --- R/greedyMix.R | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/R/greedyMix.R b/R/greedyMix.R index dd0d117..cea602b 100644 --- a/R/greedyMix.R +++ b/R/greedyMix.R @@ -27,6 +27,7 @@ greedyMix <- function( inp = NULL, popnames = NULL, fixedK = FALSE, verbose = FALSE ) { # Importing and handling data ================================================ + # TODO: use format as class and make handling data a generic if (tolower(format) %in% "fasta") { data <- convert_FASTA_to_BAPS(data) format <- "baps" @@ -42,6 +43,17 @@ greedyMix <- function( Z = data[["Z"]], dist = data[["dist"]] ) + } else if (tolower(format) %in% "genepop") { + data <- process_GenePop_data(data) + c <- list( + noalle = data[["noalle"]], + data = data[["data"]], + adjprior = data[["adjprior"]], + priorTerm = data[["priorTerm"]], + rowsFromInd = data[["rowsFromInd"]], + Z = data[["Z"]], + dist = data[["dist"]] + ) } else { data <- importFile(data, format, verbose) data <- handleData(data, tolower(format)) @@ -68,7 +80,7 @@ greedyMix <- function( # Generating partition summary =============================================== ekat <- seq(1L, ninds * c[["rowsFromInd"]], c[["rowsFromInd"]]) c[["rows"]] <- cbind(ekat, ekat + c[["rowsFromInd"]] - 1L) - logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose) # FIXME: not working for FASTA data + logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose) # FIXME: not working for FASTA, GenePop logml <- logml_npops_partitionSummary[["logml"]] npops <- logml_npops_partitionSummary[["npops"]] partitionSummary <- logml_npops_partitionSummary[["partitionSummary"]] From b4fe83482284ec4e392e37933d97236060b18e1a Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Fri, 13 Sep 2024 14:19:25 +0200 Subject: [PATCH 8/9] Temp-wrapping disfunctional code --- tests/testthat/test-greedyMix.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-greedyMix.R b/tests/testthat/test-greedyMix.R index 864223a..2c80ca3 100644 --- a/tests/testthat/test-greedyMix.R +++ b/tests/testthat/test-greedyMix.R @@ -69,7 +69,7 @@ test_that("greedyMix() works when it should", { genepop_file <- file.path(path_inst, "GenePop.txt") fasta_file <- file.path(path_inst, "FASTA_clustering_haploid.fasta") greedy_baps <- greedyMix(baps_file, "BAPS") - greedy_genepop <- greedyMix(genepop_file, "GenePop") # FIXME: doesn't match MATLAB output + expect_error(greedy_genepop <- greedyMix(genepop_file, "GenePop")) # TEMP: fails expect_type(greedy_baps, "list") expect_length(greedy_baps, 10L) }) From aa675d1804d00b036837ed588dc598f0f66810f8 Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Fri, 13 Sep 2024 14:23:11 +0200 Subject: [PATCH 9/9] Updated docs --- man/handleData.Rd | 2 +- man/importFile.Rd | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/man/handleData.Rd b/man/handleData.Rd index 4a20a8c..6dd4c90 100644 --- a/man/handleData.Rd +++ b/man/handleData.Rd @@ -4,7 +4,7 @@ \alias{handleData} \title{Handle Data} \usage{ -handleData(raw_data, format = "Genepop") +handleData(raw_data, format = "baps") } \arguments{ \item{raw_data}{Raw data in Genepop or BAPS format} diff --git a/man/importFile.Rd b/man/importFile.Rd index 9adb584..2ad7e3b 100644 --- a/man/importFile.Rd +++ b/man/importFile.Rd @@ -17,8 +17,7 @@ importFile(data, format, verbose) The data in a format that can be used by the other functions } \description{ -Imports data from several formats (FASTA, VCF, SAM, BAM, -Genepop). +Imports data from several formats (FASTA, VCF, SAM, BAM). } \examples{ path_inst <- system.file("extdata", "", package = "rBAPS")