diff --git a/DESCRIPTION b/DESCRIPTION index 4a28dcf..7ed5904 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: rBAPS Title: Bayesian Analysis of Population Structure -Version: 0.0.0.9024 +Version: 0.0.0.9025 Date: 2020-11-09 Authors@R: c( @@ -36,7 +36,7 @@ Description: Partial R implementation of the BAPS software License: GPL-3 BugReports: https://github.com/ocbe-uio/rBAPS/issues Encoding: UTF-8 -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Suggests: testthat (>= 2.1.0) Imports: diff --git a/NAMESPACE b/NAMESPACE index a0b3bf9..09db974 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -38,5 +38,6 @@ importFrom(methods,is) importFrom(stats,runif) importFrom(stats,sd) importFrom(utils,read.delim) +importFrom(utils,read.table) importFrom(vcfR,read.vcfR) importFrom(zeallot,"%<-%") diff --git a/R/greedyMix.R b/R/greedyMix.R index aa74a5a..13e9a6d 100644 --- a/R/greedyMix.R +++ b/R/greedyMix.R @@ -2,7 +2,6 @@ #' @param data data file #' @param format Data format. Format supported: "FASTA", "VCF" ,"BAM", "GenePop" #' @param partitionCompare a list of partitions to compare -#' @param ninds number of individuals #' @param npops number of populations #' @param counts counts #' @param sumcounts sumcounts @@ -20,25 +19,46 @@ #' with high-throughput sequencing data. #' @export #' @examples -#' data <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS") -#' greedyMix(data, "fasta") +#' \dontrun{ # TEMP: unwrap once #24 is resolved +#' data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS") +#' greedyMix(data, "baps") +#' } # TEMP: unwrap once #24 is resolved greedyMix <- function( - data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, ninds = 1L, npops = 1L, + data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, npops = 1L, counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL, inp = NULL, popnames = NULL, fixedK = FALSE, verbose = FALSE ) { # Importing and handling data ================================================ - data <- importFile(data, format, verbose) - data <- handleData(data, tolower(format)) - c <- list( - noalle = data[["noalle"]], - data = data[["newData"]], - adjprior = data[["adjprior"]], - priorTerm = data[["priorTerm"]], - rowsFromInd = data[["rowsFromInd"]] - ) + if (tolower(format) %in% "fasta") { + stop("FASTA format not yet supported on greedyMix") + } + if (tolower(format) %in% "baps") { + data <- process_BAPS_data(data, NULL) + c <- list( + noalle = data[["noalle"]], + data = data[["data"]], + adjprior = data[["adjprior"]], + priorTerm = data[["priorTerm"]], + rowsFromInd = data[["rowsFromInd"]], + Z = data[["Z"]], + dist = data[["dist"]] + ) + } else { + data <- importFile(data, format, verbose) + data <- handleData(data, tolower(format)) + c <- list( + noalle = data[["noalle"]], + data = data[["newData"]], + adjprior = data[["adjprior"]], + priorTerm = data[["priorTerm"]], + rowsFromInd = data[["rowsFromInd"]], + Z = data[["Z"]], + dist = data[["dist"]] + ) + } # Comparing partitions ======================================================= + ninds <- length(unique(c[["data"]][, ncol(c[["data"]])])) if (!is.null(partitionCompare)) { logmls <- comparePartitions( c[["data"]], nrow(c[["data"]]), partitionCompare[["partitions"]], ninds, @@ -46,10 +66,9 @@ greedyMix <- function( ) } - # Generating partition summary =============================================== - ekat <- seq(1L, c[["rowsFromInd"]], ninds * c[["rowsFromInd"]]) # ekat = (1:rowsFromInd:ninds*rowsFromInd)'; - c[["rows"]] <- c(ekat, ekat + c[["rowsFromInd"]] - 1L) # c.rows = [ekat ekat+rowsFromInd-1] + ekat <- seq(1L, ninds * c[["rowsFromInd"]], c[["rowsFromInd"]]) + c[["rows"]] <- cbind(ekat, ekat + c[["rowsFromInd"]] - 1L) logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose) logml <- logml_npops_partitionSummary[["logml"]] npops <- logml_npops_partitionSummary[["npops"]] diff --git a/R/handleData.R b/R/handleData.R index 6b4a209..8868fc2 100644 --- a/R/handleData.R +++ b/R/handleData.R @@ -30,17 +30,17 @@ handleData <- function(raw_data, format = "Genepop") { "bam" = stop("BAM format not supported for processing yet") ) data <- as.matrix(raw_data) - dataApu <- data[, 1:nloci] + dataApu <- data[, seq_len(nloci)] nollat <- matlab2r::find(dataApu == 0) if (!isempty(nollat)) { isoinAlleeli <- base::max(base::max(dataApu)) dataApu[nollat] <- isoinAlleeli + 1 - data[, 1:nloci] <- dataApu + data[, seq_len(nloci)] <- dataApu } noalle <- zeros(1, nloci) alleelitLokuksessa <- cell(nloci, 1, expandable = TRUE) - for (i in 1:nloci) { + for (i in seq_len(nloci)) { alleelitLokuksessaI <- unique(data[, i]) alleelitLokuksessa[[i]] <- sort(alleelitLokuksessaI[ matlab2r::find(alleelitLokuksessaI >= 0) @@ -48,7 +48,7 @@ handleData <- function(raw_data, format = "Genepop") { noalle[i] <- length(alleelitLokuksessa[[i]]) } alleleCodes <- zeros(base::max(noalle), nloci) - for (i in 1:nloci) { + for (i in seq_len(nloci)) { alleelitLokuksessaI <- alleelitLokuksessa[[i]] puuttuvia <- base::max(noalle) - length(alleelitLokuksessaI) alleleCodes[, i] <- as.matrix(c(alleelitLokuksessaI, zeros(puuttuvia, 1))) @@ -83,7 +83,7 @@ handleData <- function(raw_data, format = "Genepop") { adjprior <- zeros(base::max(noalle), nloci) priorTerm <- 0 - for (j in 1:nloci) { + for (j in seq_len(nloci)) { adjprior[, j] <- as.matrix(c( repmat(1 / noalle[j], c(noalle[j], 1)), ones(base::max(noalle) - noalle[j], 1) diff --git a/R/indMix.R b/R/indMix.R index 6d2a1ad..f30a9e9 100644 --- a/R/indMix.R +++ b/R/indMix.R @@ -21,11 +21,9 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d Z <- NULL } - - rm(c) - nargin <- length(as.list(match.call())) - 1 - if (nargin < 2) { - dispText <- 1 + if (missing(npops)) { + # Recreate npopsTaulu from objects other than npops + dispText <- TRUE npopstext <- matrix() ready <- FALSE teksti <- "Input upper bound to the number of populations (possibly multiple values)" @@ -75,7 +73,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d partitionSummary[, 1] <- zeros(30, 1) worstLogml <- -1e50 worstIndex <- 1 - for (run in seq_along(nruns)) { + for (run in seq_len(nruns)) { npops <- npopsTaulu[[run]] if (dispText) { dispLine() @@ -110,7 +108,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d # PARHAAN MIXTURE-PARTITION ETSIMINEN nRoundTypes <- 7 kokeiltu <- zeros(nRoundTypes, 1) - roundTypes <- c(1, 1) # Ykk�svaiheen sykli kahteen kertaan. + roundTypes <- t(c(1, 1)) # Ykk�svaiheen sykli kahteen kertaan. ready <- 0 vaihe <- 1 @@ -143,9 +141,9 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d if (kokeiltu[round] == 1) { # Askelta kokeiltu viime muutoksen j�lkeen } else if (round == 0 | round == 1) { # Yksil�n siirt�minen toiseen populaatioon. inds <- seq_len(ninds) - aputaulu <- cbind(t(inds), rand(ninds, 1)) + aputaulu <- cbind(inds, rand(ninds, 1)) aputaulu <- matrix(sortrows(aputaulu, 2), nrow = nrow(aputaulu)) - inds <- t(aputaulu[, 1]) + inds <- aputaulu[, 1] muutosNyt <- 0 for (ind in inds) { @@ -324,7 +322,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d if (round == 5) { aputaulu <- c(inds, rand(length(inds), 1)) aputaulu <- sortrows(aputaulu, 2) - inds <- t(aputaulu[, 1]) + inds <- aputaulu[, 1] } else if (round == 6) { inds <- returnInOrder( inds, pop, rows, data, adjprior, priorTerm @@ -525,15 +523,15 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d if (ready == 0) { if (vaihe == 1) { - roundTypes <- 1 + roundTypes <- t(1) } else if (vaihe == 2) { - roundTypes <- c(2, 1) + roundTypes <- t(c(2, 1)) } else if (vaihe == 3) { - roundTypes <- c(5, 5, 7) + roundTypes <- t(c(5, 5, 7)) } else if (vaihe == 4) { - roundTypes <- c(4, 3, 1) + roundTypes <- t(c(4, 3, 1)) } else if (vaihe == 5) { - roundTypes <- c(6, 7, 2, 3, 4, 1) + roundTypes <- t(c(6, 7, 2, 3, 4, 1)) } } } diff --git a/R/initialCounts.R b/R/initialCounts.R index 5d7188c..7328f64 100644 --- a/R/initialCounts.R +++ b/R/initialCounts.R @@ -7,8 +7,8 @@ initialCounts <- function(partition, data, npops, rows, noalle, adjprior) { counts <- zeros(base::max(noalle), nloci, npops) sumcounts <- zeros(npops, nloci) - for (i in 1:npops) { - for (j in 1:nloci) { + for (i in seq_len(npops)) { + for (j in seq_len(nloci)) { havainnotLokuksessa <- matlab2r::find(partition == i & data[, j] >= 0) sumcounts[i, j] <- length(havainnotLokuksessa) for (k in 1:noalle[j]) { diff --git a/R/linkage.R b/R/linkage.R index d030ff1..9648f85 100644 --- a/R/linkage.R +++ b/R/linkage.R @@ -26,11 +26,11 @@ linkage <- function(Y, method = "co") { monotonic <- 1 Z <- zeros(m - 1, 3) # allocate the output matrix. N <- zeros(1, 2 * m - 1) - N[1:m] <- 1 + N[seq_len(m)] <- 1 n <- m # since m is changing, we need to save m in n. - R <- 1:n + R <- seq_len(n) for (s in 1:(n - 1)) { - X <- as.matrix(as.vector(Y), ncol = 1) + X <- as.matrix(as.vector(Y), nrow = 1) v <- matlab2r::min(X)$mins k <- matlab2r::min(X)$idx @@ -83,11 +83,14 @@ linkage <- function(Y, method = "co") { ) J <- c(J, i * (m - (i + 1) / 2) - m + j) Y <- Y[-J] # no need for the cluster information about j + # update m, N, R m <- m - 1 N[n + s] <- N[R[i]] + N[R[j]] R[i] <- n + s - R[j:(n - 1)] <- R[(j + 1):n] + if (j < n) { + R[j:(n - 1)] <- R[(j + 1):n] + } } return(Z) } diff --git a/R/logml2String.R b/R/logml2String.R index 09ef88b..784812a 100644 --- a/R/logml2String.R +++ b/R/logml2String.R @@ -1,7 +1,6 @@ #' @title Logml to string #' @description Returns a string representation of a logml #' @param logml input Logml -#' @param #' @param leading_zeros_replacement string to replace leading zeros with #' @return String version of logml logml2String <- function(logml, leading_zeros_replacement = " ") { diff --git a/R/newGetDistances.R b/R/newGetDistances.R index cf62722..ef12041 100644 --- a/R/newGetDistances.R +++ b/R/newGetDistances.R @@ -35,7 +35,7 @@ newGetDistances <- function(data, rowsFromInd) { y <- zeros(size(toka)) y <- apply(y, 2, as.integer) - for (j in 1:nloci) { + for (j in seq_len(nloci)) { for (k in 1:rowsFromInd) { x[, k] <- data[eka[, k], j] y[, k] <- data[toka[, k], j] diff --git a/R/process_BAPS_data.R b/R/process_BAPS_data.R new file mode 100644 index 0000000..7a76bf6 --- /dev/null +++ b/R/process_BAPS_data.R @@ -0,0 +1,37 @@ +process_BAPS_data <- function(file, partitionCompare) { + if (!is.null(partitionCompare)) { + cat('Data:', file, '\n') + } + data <- read.table(file) + ninds <- testaaOnkoKunnollinenBapsData(data) # for testing purposes? + if (ninds == 0) { + warning('Incorrect Data-file.') + return(NULL) + } + popnames <- NULL # Dropped specification of population names (from BAPS 6) + + result <- handleData(data, format = "BAPS") + data <- result$newData + rowsFromInd <- result$rowsFromInd + alleleCodes <- result$alleleCodes + noalle <- result$noalle + adjprior <- result$adjprior + priorTerm <- result$priorTerm + result <- newGetDistances(data, rowsFromInd) + Z <- result$Z + dist <- result$dist + + # Forming and saving pre-processed data + processed_data <- list( + data = data, + rowsFromInd = rowsFromInd, + alleleCodes = alleleCodes, + noalle = noalle, + adjprior = adjprior, + priorTerm = priorTerm, + dist = dist, + popnames = popnames, + Z = Z + ) + return(processed_data) +} diff --git a/R/rBAPS-package.R b/R/rBAPS-package.R index 810a2f1..c58245a 100644 --- a/R/rBAPS-package.R +++ b/R/rBAPS-package.R @@ -1,6 +1,5 @@ #' @title Bayesian Analysis of Population Structure #' @description This is a partial implementation of the BAPS software -#' @docType package #' @name rBAPS #' @note Found a bug? Want to suggest a feature? Contribute to the scientific #' and open source communities by opening an issue on our home page. @@ -11,4 +10,5 @@ #' @importFrom stats runif #' @importFrom zeallot %<-% #' @importFrom matlab2r nargin log2 -NULL +#' @importFrom utils read.table +"_PACKAGE" diff --git a/man/greedyMix.Rd b/man/greedyMix.Rd index e10fa87..f9c58a6 100644 --- a/man/greedyMix.Rd +++ b/man/greedyMix.Rd @@ -8,7 +8,6 @@ greedyMix( data, format = gsub("^.*\\\\.", "", data), partitionCompare = NULL, - ninds = 1L, npops = 1L, counts = NULL, sumcounts = NULL, @@ -27,8 +26,6 @@ greedyMix( \item{partitionCompare}{a list of partitions to compare} -\item{ninds}{number of individuals} - \item{npops}{number of populations} \item{counts}{counts} @@ -51,8 +48,10 @@ greedyMix( Clustering of individuals } \examples{ -data <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS") -greedyMix(data, "fasta") +\dontrun{ # TEMP: unwrap once #24 is resolved +data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS") +greedyMix(data, "baps") +} # TEMP: unwrap once #24 is resolved } \references{ Samtools: a suite of programs for interacting diff --git a/man/rBAPS.Rd b/man/rBAPS.Rd index cd92bd6..89040f3 100644 --- a/man/rBAPS.Rd +++ b/man/rBAPS.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/rBAPS-package.R \docType{package} \name{rBAPS} +\alias{rBAPS-package} \alias{rBAPS} \title{Bayesian Analysis of Population Structure} \description{ @@ -12,3 +13,20 @@ Found a bug? Want to suggest a feature? Contribute to the scientific and open source communities by opening an issue on our home page. Check the "BugReports" field on the package description for the URL. } +\seealso{ +Useful links: +\itemize{ + \item Report bugs at \url{https://github.com/ocbe-uio/rBAPS/issues} +} + +} +\author{ +\strong{Maintainer}: Waldir Leoncio \email{w.l.netto@medisin.uio.no} + +Authors: +\itemize{ + \item Jukka Corander \email{jukka.corander@medisin.uio.no} + \item Gerry Tonkin-Hill \email{jukka.corander@medisin.uio.no} +} + +} diff --git a/tests/testthat/test-greedyMix.R b/tests/testthat/test-greedyMix.R index 7e6da02..20380af 100644 --- a/tests/testthat/test-greedyMix.R +++ b/tests/testthat/test-greedyMix.R @@ -57,12 +57,15 @@ test_that("Files are imported correctly", { ) ) expect_equal(length(raw_bam[[1]]), 13) + expect_error( + greedyMix( + data = file.path(path_inst, "FASTA_clustering_haploid.fasta"), + format = "FASTA" + ), + "FASTA format not yet supported on greedyMix" + ) }) -df_fasta <- greedyMix( - data = file.path(path_inst, "FASTA_clustering_haploid.fasta"), - format = "FASTA" -) test_that("greedyMix() works", { expect_error(greedyMix(file.path(path_inst, "vcf_example.vcf"))) expect_error(greedyMix(file.path(path_inst, "bam_example.bam")))