Merge branch 'baps-format' into develop

* baps-format: Increment version number to 0.0.0.9025 Skips `greedyMix()` example until #24 is fixed Updated documentation (#24) Fails for FAST input to `greedyMix()` (#24) Fixed calculation og `ninds` (#24) Bugfixes to `indMix()` and subfunctions (#24) Improved handling of BAPS format (#24) Fixed calculation of `ninds` `ekat` (#24) Added processing of BAPS format
2024-04-08 13:35:51 +02:00 · 2024-04-08 13:35:51 +02:00 · 9b8da3ab4d
commit 9b8da3ab4d
parent d2ec5894b3 6d875dfd8d
14 changed files with 134 additions and 57 deletions
--- a/4
+++ b/4
@ -1,6 +1,6 @@
 Package: rBAPS
 Title: Bayesian Analysis of Population Structure
-Version: 0.0.0.9024
+Version: 0.0.0.9025
 Date: 2020-11-09
 Authors@R:
    c(
@ -36,7 +36,7 @@ Description: Partial R implementation of the BAPS software
 License: GPL-3
 BugReports: https://github.com/ocbe-uio/rBAPS/issues
 Encoding: UTF-8
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Suggests:
    testthat (>= 2.1.0)
 Imports:
--- a/1
+++ b/1
@ -38,5 +38,6 @@ importFrom(methods,is)
 importFrom(stats,runif)
 importFrom(stats,sd)
 importFrom(utils,read.delim)
+importFrom(utils,read.table)
 importFrom(vcfR,read.vcfR)
 importFrom(zeallot,"%<-%")
--- a/R/greedyMix.R
+++ b/R/greedyMix.R
@ -2,7 +2,6 @@
 #' @param data data file
 #' @param format Data format. Format supported: "FASTA", "VCF" ,"BAM", "GenePop"
 #' @param partitionCompare a list of partitions to compare
-#' @param ninds number of individuals
 #' @param npops number of populations
 #' @param counts counts
 #' @param sumcounts sumcounts
@ -20,25 +19,46 @@
 #' with high-throughput sequencing data. <http://www.htslib.org/>
 #' @export
 #' @examples
-#' data <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS")
-#' greedyMix(data, "fasta")
+#' \dontrun{ # TEMP: unwrap once #24 is resolved
+#' data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS")
+#' greedyMix(data, "baps")
+#' } # TEMP: unwrap once #24 is resolved
 greedyMix <- function(
-  data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, ninds = 1L, npops = 1L,
+  data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, npops = 1L,
  counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL,
  inp = NULL, popnames = NULL, fixedK = FALSE, verbose = FALSE
 ) {
  # Importing and handling data ================================================
-  data <- importFile(data, format, verbose)
-  data <- handleData(data, tolower(format))
-  c <- list(
-    noalle = data[["noalle"]],
-    data = data[["newData"]],
-    adjprior = data[["adjprior"]],
-    priorTerm = data[["priorTerm"]],
-    rowsFromInd = data[["rowsFromInd"]]
-  )
+  if (tolower(format) %in% "fasta") {
+    stop("FASTA format not yet supported on greedyMix")
+  }
+  if (tolower(format) %in% "baps") {
+    data <- process_BAPS_data(data, NULL)
+    c <- list(
+      noalle = data[["noalle"]],
+      data = data[["data"]],
+      adjprior = data[["adjprior"]],
+      priorTerm = data[["priorTerm"]],
+      rowsFromInd = data[["rowsFromInd"]],
+      Z = data[["Z"]],
+      dist = data[["dist"]]
+    )
+  } else {
+    data <- importFile(data, format, verbose)
+    data <- handleData(data, tolower(format))
+    c <- list(
+      noalle = data[["noalle"]],
+      data = data[["newData"]],
+      adjprior = data[["adjprior"]],
+      priorTerm = data[["priorTerm"]],
+      rowsFromInd = data[["rowsFromInd"]],
+      Z = data[["Z"]],
+      dist = data[["dist"]]
+    )
+  }

  # Comparing partitions =======================================================
+  ninds <- length(unique(c[["data"]][, ncol(c[["data"]])]))
  if (!is.null(partitionCompare)) {
    logmls <- comparePartitions(
      c[["data"]], nrow(c[["data"]]), partitionCompare[["partitions"]], ninds,
@ -46,10 +66,9 @@ greedyMix <- function(
    )
  }

-
  # Generating partition summary ===============================================
-  ekat <- seq(1L, c[["rowsFromInd"]], ninds * c[["rowsFromInd"]]) # ekat = (1:rowsFromInd:ninds*rowsFromInd)';
-  c[["rows"]] <- c(ekat, ekat + c[["rowsFromInd"]] - 1L) # c.rows = [ekat ekat+rowsFromInd-1]
+  ekat <- seq(1L, ninds * c[["rowsFromInd"]], c[["rowsFromInd"]])
+  c[["rows"]] <- cbind(ekat, ekat + c[["rowsFromInd"]] - 1L)
  logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose)
  logml <- logml_npops_partitionSummary[["logml"]]
  npops <- logml_npops_partitionSummary[["npops"]]
--- a/R/handleData.R
+++ b/R/handleData.R
@ -30,17 +30,17 @@ handleData <- function(raw_data, format = "Genepop") {
    "bam"     = stop("BAM format not supported for processing yet")
  )
  data <- as.matrix(raw_data)
-  dataApu <- data[, 1:nloci]
+  dataApu <- data[, seq_len(nloci)]
  nollat <- matlab2r::find(dataApu == 0)
  if (!isempty(nollat)) {
    isoinAlleeli <- base::max(base::max(dataApu))
    dataApu[nollat] <- isoinAlleeli + 1
-    data[, 1:nloci] <- dataApu
+    data[, seq_len(nloci)] <- dataApu
  }

  noalle <- zeros(1, nloci)
  alleelitLokuksessa <- cell(nloci, 1, expandable = TRUE)
-  for (i in 1:nloci) {
+  for (i in seq_len(nloci)) {
    alleelitLokuksessaI <- unique(data[, i])
    alleelitLokuksessa[[i]] <- sort(alleelitLokuksessaI[
      matlab2r::find(alleelitLokuksessaI >= 0)
@ -48,7 +48,7 @@ handleData <- function(raw_data, format = "Genepop") {
    noalle[i] <- length(alleelitLokuksessa[[i]])
  }
  alleleCodes <- zeros(base::max(noalle), nloci)
-  for (i in 1:nloci) {
+  for (i in seq_len(nloci)) {
    alleelitLokuksessaI <- alleelitLokuksessa[[i]]
    puuttuvia <- base::max(noalle) - length(alleelitLokuksessaI)
    alleleCodes[, i] <- as.matrix(c(alleelitLokuksessaI, zeros(puuttuvia, 1)))
@ -83,7 +83,7 @@ handleData <- function(raw_data, format = "Genepop") {

  adjprior <- zeros(base::max(noalle), nloci)
  priorTerm <- 0
-  for (j in 1:nloci) {
+  for (j in seq_len(nloci)) {
    adjprior[, j] <- as.matrix(c(
      repmat(1 / noalle[j], c(noalle[j], 1)),
      ones(base::max(noalle) - noalle[j], 1)
--- a/R/indMix.R
+++ b/R/indMix.R
@ -21,11 +21,9 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
    Z <- NULL
  }

-
-  rm(c)
-  nargin <- length(as.list(match.call())) - 1
-  if (nargin < 2) {
-    dispText <- 1
+  if (missing(npops)) {
+    # Recreate npopsTaulu from objects other than npops
+    dispText <- TRUE
    npopstext <- matrix()
    ready <- FALSE
    teksti <- "Input upper bound to the number of populations (possibly multiple values)"
@ -75,7 +73,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
  partitionSummary[, 1] <- zeros(30, 1)
  worstLogml <- -1e50
  worstIndex <- 1
-  for (run in seq_along(nruns)) {
+  for (run in seq_len(nruns)) {
    npops <- npopsTaulu[[run]]
    if (dispText) {
      dispLine()
@ -110,7 +108,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
    # PARHAAN MIXTURE-PARTITION ETSIMINEN
    nRoundTypes <- 7
    kokeiltu <- zeros(nRoundTypes, 1)
-    roundTypes <- c(1, 1) # Ykk<6B>svaiheen sykli kahteen kertaan.
+    roundTypes <- t(c(1, 1)) # Ykk<6B>svaiheen sykli kahteen kertaan.
    ready <- 0
    vaihe <- 1

@ -143,9 +141,9 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
        if (kokeiltu[round] == 1) { # Askelta kokeiltu viime muutoksen j<>lkeen
        } else if (round == 0 | round == 1) { # Yksil<69>n siirt<72>minen toiseen populaatioon.
          inds <- seq_len(ninds)
-          aputaulu <- cbind(t(inds), rand(ninds, 1))
+          aputaulu <- cbind(inds, rand(ninds, 1))
          aputaulu <- matrix(sortrows(aputaulu, 2), nrow = nrow(aputaulu))
-          inds <- t(aputaulu[, 1])
+          inds <- aputaulu[, 1]
          muutosNyt <- 0

          for (ind in inds) {
@ -324,7 +322,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
            if (round == 5) {
              aputaulu <- c(inds, rand(length(inds), 1))
              aputaulu <- sortrows(aputaulu, 2)
-              inds <- t(aputaulu[, 1])
+              inds <- aputaulu[, 1]
            } else if (round == 6) {
              inds <- returnInOrder(
                inds, pop, rows, data, adjprior, priorTerm
@ -525,15 +523,15 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d

      if (ready == 0) {
        if (vaihe == 1) {
-          roundTypes <- 1
+          roundTypes <- t(1)
        } else if (vaihe == 2) {
-          roundTypes <- c(2, 1)
+          roundTypes <- t(c(2, 1))
        } else if (vaihe == 3) {
-          roundTypes <- c(5, 5, 7)
+          roundTypes <- t(c(5, 5, 7))
        } else if (vaihe == 4) {
-          roundTypes <- c(4, 3, 1)
+          roundTypes <- t(c(4, 3, 1))
        } else if (vaihe == 5) {
-          roundTypes <- c(6, 7, 2, 3, 4, 1)
+          roundTypes <- t(c(6, 7, 2, 3, 4, 1))
        }
      }
    }
--- a/R/initialCounts.R
+++ b/R/initialCounts.R
@ -7,8 +7,8 @@ initialCounts <- function(partition, data, npops, rows, noalle, adjprior) {

  counts <- zeros(base::max(noalle), nloci, npops)
  sumcounts <- zeros(npops, nloci)
-  for (i in 1:npops) {
-    for (j in 1:nloci) {
+  for (i in seq_len(npops)) {
+    for (j in seq_len(nloci)) {
      havainnotLokuksessa <- matlab2r::find(partition == i & data[, j] >= 0)
      sumcounts[i, j] <- length(havainnotLokuksessa)
      for (k in 1:noalle[j]) {
--- a/R/linkage.R
+++ b/R/linkage.R
@ -26,11 +26,11 @@ linkage <- function(Y, method = "co") {
  monotonic <- 1
  Z <- zeros(m - 1, 3) # allocate the output matrix.
  N <- zeros(1, 2 * m - 1)
-  N[1:m] <- 1
+  N[seq_len(m)] <- 1
  n <- m # since m is changing, we need to save m in n.
-  R <- 1:n
+  R <- seq_len(n)
  for (s in 1:(n - 1)) {
-    X <- as.matrix(as.vector(Y), ncol = 1)
+    X <- as.matrix(as.vector(Y), nrow = 1)
    v <- matlab2r::min(X)$mins
    k <- matlab2r::min(X)$idx

@ -83,11 +83,14 @@ linkage <- function(Y, method = "co") {
    )
    J <- c(J, i * (m - (i + 1) / 2) - m + j)
    Y <- Y[-J] # no need for the cluster information about j
+
    # update m, N, R
    m <- m - 1
    N[n + s] <- N[R[i]] + N[R[j]]
    R[i] <- n + s
-    R[j:(n - 1)] <- R[(j + 1):n]
+    if (j < n) {
+      R[j:(n - 1)] <- R[(j + 1):n]
+    }
  }
  return(Z)
 }
--- a/R/logml2String.R
+++ b/R/logml2String.R
@ -1,7 +1,6 @@
 #' @title Logml to string
 #' @description Returns a string representation of a logml
 #' @param logml input Logml
-#' @param
 #' @param leading_zeros_replacement string to replace leading zeros with
 #' @return String version of logml
 logml2String <- function(logml, leading_zeros_replacement = " ") {
--- a/R/newGetDistances.R
+++ b/R/newGetDistances.R
@ -35,7 +35,7 @@ newGetDistances <- function(data, rowsFromInd) {
  y <- zeros(size(toka))
  y <- apply(y, 2, as.integer)

-  for (j in 1:nloci) {
+  for (j in seq_len(nloci)) {
    for (k in 1:rowsFromInd) {
      x[, k] <- data[eka[, k], j]
      y[, k] <- data[toka[, k], j]
--- a/R/process_BAPS_data.R
+++ b/R/process_BAPS_data.R
@ -0,0 +1,37 @@
+process_BAPS_data <- function(file, partitionCompare) {
+  if (!is.null(partitionCompare)) {
+    cat('Data:', file, '\n')
+  }
+  data <- read.table(file)
+  ninds <- testaaOnkoKunnollinenBapsData(data)  # for testing purposes?
+  if (ninds == 0) {
+    warning('Incorrect Data-file.')
+    return(NULL)
+  }
+  popnames <- NULL  # Dropped specification of population names (from BAPS 6)
+
+  result <- handleData(data, format = "BAPS")
+  data <- result$newData
+  rowsFromInd <- result$rowsFromInd
+  alleleCodes <- result$alleleCodes
+  noalle <- result$noalle
+  adjprior <- result$adjprior
+  priorTerm <- result$priorTerm
+  result <- newGetDistances(data, rowsFromInd)
+  Z <- result$Z
+  dist <- result$dist
+
+  # Forming and saving pre-processed data
+  processed_data <- list(
+    data = data,
+    rowsFromInd = rowsFromInd,
+    alleleCodes = alleleCodes,
+    noalle = noalle,
+    adjprior = adjprior,
+    priorTerm = priorTerm,
+    dist = dist,
+    popnames = popnames,
+    Z = Z
+  )
+  return(processed_data)
+}
--- a/R/rBAPS-package.R
+++ b/R/rBAPS-package.R
@ -1,6 +1,5 @@
 #' @title Bayesian Analysis of Population Structure
 #' @description This is a partial implementation of the BAPS software
-#' @docType package
 #' @name rBAPS
 #' @note Found a bug? Want to suggest a feature? Contribute to the scientific
 #' and open source communities by opening an issue on our home page.
@ -11,4 +10,5 @@
 #' @importFrom stats runif
 #' @importFrom zeallot %<-%
 #' @importFrom matlab2r nargin log2
-NULL
+#' @importFrom utils read.table
+"_PACKAGE"
--- a/man/greedyMix.Rd
+++ b/man/greedyMix.Rd
@ -8,7 +8,6 @@ greedyMix(
  data,
  format = gsub("^.*\\\\.", "", data),
  partitionCompare = NULL,
-  ninds = 1L,
  npops = 1L,
  counts = NULL,
  sumcounts = NULL,
@ -27,8 +26,6 @@ greedyMix(

 \item{partitionCompare}{a list of partitions to compare}

-\item{ninds}{number of individuals}
-
 \item{npops}{number of populations}

 \item{counts}{counts}
@ -51,8 +48,10 @@ greedyMix(
 Clustering of individuals
 }
 \examples{
-data <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS")
-greedyMix(data, "fasta")
+\dontrun{ # TEMP: unwrap once #24 is resolved
+data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS")
+greedyMix(data, "baps")
+} # TEMP: unwrap once #24 is resolved
 }
 \references{
 Samtools: a suite of programs for interacting
--- a/man/rBAPS.Rd
+++ b/man/rBAPS.Rd
@ -2,6 +2,7 @@
 % Please edit documentation in R/rBAPS-package.R
 \docType{package}
 \name{rBAPS}
+\alias{rBAPS-package}
 \alias{rBAPS}
 \title{Bayesian Analysis of Population Structure}
 \description{
@ -12,3 +13,20 @@ Found a bug? Want to suggest a feature? Contribute to the scientific
 and open source communities by opening an issue on our home page.
 Check the "BugReports" field on the package description for the URL.
 }
+\seealso{
+Useful links:
+\itemize{
+  \item Report bugs at \url{https://github.com/ocbe-uio/rBAPS/issues}
+}
+
+}
+\author{
+\strong{Maintainer}: Waldir Leoncio \email{w.l.netto@medisin.uio.no}
+
+Authors:
+\itemize{
+  \item Jukka Corander \email{jukka.corander@medisin.uio.no}
+  \item Gerry Tonkin-Hill \email{jukka.corander@medisin.uio.no}
+}
+
+}
--- a/tests/testthat/test-greedyMix.R
+++ b/tests/testthat/test-greedyMix.R
@ -57,12 +57,15 @@ test_that("Files are imported correctly", {
    )
  )
  expect_equal(length(raw_bam[[1]]), 13)
+  expect_error(
+    greedyMix(
+      data   = file.path(path_inst, "FASTA_clustering_haploid.fasta"),
+      format = "FASTA"
+    ),
+    "FASTA format not yet supported on greedyMix"
+  )
 })

-df_fasta <- greedyMix(
-  data   = file.path(path_inst, "FASTA_clustering_haploid.fasta"),
-  format = "FASTA"
-)
 test_that("greedyMix() works", {
  expect_error(greedyMix(file.path(path_inst, "vcf_example.vcf")))
  expect_error(greedyMix(file.path(path_inst, "bam_example.bam")))