Merge branch 'baps-format' into develop

* baps-format:
  Increment version number to 0.0.0.9025
  Skips `greedyMix()` example until #24 is fixed
  Updated documentation (#24)
  Fails for FAST input to `greedyMix()` (#24)
  Fixed calculation og `ninds` (#24)
  Bugfixes to `indMix()` and subfunctions (#24)
  Improved handling of BAPS format (#24)
  Fixed calculation of `ninds` `ekat` (#24)
  Added processing of BAPS format
This commit is contained in:
Waldir Leoncio 2024-04-08 13:35:51 +02:00
commit 9b8da3ab4d
14 changed files with 134 additions and 57 deletions

View file

@ -1,6 +1,6 @@
Package: rBAPS Package: rBAPS
Title: Bayesian Analysis of Population Structure Title: Bayesian Analysis of Population Structure
Version: 0.0.0.9024 Version: 0.0.0.9025
Date: 2020-11-09 Date: 2020-11-09
Authors@R: Authors@R:
c( c(
@ -36,7 +36,7 @@ Description: Partial R implementation of the BAPS software
License: GPL-3 License: GPL-3
BugReports: https://github.com/ocbe-uio/rBAPS/issues BugReports: https://github.com/ocbe-uio/rBAPS/issues
Encoding: UTF-8 Encoding: UTF-8
RoxygenNote: 7.2.3 RoxygenNote: 7.3.1
Suggests: Suggests:
testthat (>= 2.1.0) testthat (>= 2.1.0)
Imports: Imports:

View file

@ -38,5 +38,6 @@ importFrom(methods,is)
importFrom(stats,runif) importFrom(stats,runif)
importFrom(stats,sd) importFrom(stats,sd)
importFrom(utils,read.delim) importFrom(utils,read.delim)
importFrom(utils,read.table)
importFrom(vcfR,read.vcfR) importFrom(vcfR,read.vcfR)
importFrom(zeallot,"%<-%") importFrom(zeallot,"%<-%")

View file

@ -2,7 +2,6 @@
#' @param data data file #' @param data data file
#' @param format Data format. Format supported: "FASTA", "VCF" ,"BAM", "GenePop" #' @param format Data format. Format supported: "FASTA", "VCF" ,"BAM", "GenePop"
#' @param partitionCompare a list of partitions to compare #' @param partitionCompare a list of partitions to compare
#' @param ninds number of individuals
#' @param npops number of populations #' @param npops number of populations
#' @param counts counts #' @param counts counts
#' @param sumcounts sumcounts #' @param sumcounts sumcounts
@ -20,25 +19,46 @@
#' with high-throughput sequencing data. <http://www.htslib.org/> #' with high-throughput sequencing data. <http://www.htslib.org/>
#' @export #' @export
#' @examples #' @examples
#' data <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS") #' \dontrun{ # TEMP: unwrap once #24 is resolved
#' greedyMix(data, "fasta") #' data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS")
#' greedyMix(data, "baps")
#' } # TEMP: unwrap once #24 is resolved
greedyMix <- function( greedyMix <- function(
data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, ninds = 1L, npops = 1L, data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, npops = 1L,
counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL, counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL,
inp = NULL, popnames = NULL, fixedK = FALSE, verbose = FALSE inp = NULL, popnames = NULL, fixedK = FALSE, verbose = FALSE
) { ) {
# Importing and handling data ================================================ # Importing and handling data ================================================
data <- importFile(data, format, verbose) if (tolower(format) %in% "fasta") {
data <- handleData(data, tolower(format)) stop("FASTA format not yet supported on greedyMix")
c <- list( }
noalle = data[["noalle"]], if (tolower(format) %in% "baps") {
data = data[["newData"]], data <- process_BAPS_data(data, NULL)
adjprior = data[["adjprior"]], c <- list(
priorTerm = data[["priorTerm"]], noalle = data[["noalle"]],
rowsFromInd = data[["rowsFromInd"]] data = data[["data"]],
) adjprior = data[["adjprior"]],
priorTerm = data[["priorTerm"]],
rowsFromInd = data[["rowsFromInd"]],
Z = data[["Z"]],
dist = data[["dist"]]
)
} else {
data <- importFile(data, format, verbose)
data <- handleData(data, tolower(format))
c <- list(
noalle = data[["noalle"]],
data = data[["newData"]],
adjprior = data[["adjprior"]],
priorTerm = data[["priorTerm"]],
rowsFromInd = data[["rowsFromInd"]],
Z = data[["Z"]],
dist = data[["dist"]]
)
}
# Comparing partitions ======================================================= # Comparing partitions =======================================================
ninds <- length(unique(c[["data"]][, ncol(c[["data"]])]))
if (!is.null(partitionCompare)) { if (!is.null(partitionCompare)) {
logmls <- comparePartitions( logmls <- comparePartitions(
c[["data"]], nrow(c[["data"]]), partitionCompare[["partitions"]], ninds, c[["data"]], nrow(c[["data"]]), partitionCompare[["partitions"]], ninds,
@ -46,10 +66,9 @@ greedyMix <- function(
) )
} }
# Generating partition summary =============================================== # Generating partition summary ===============================================
ekat <- seq(1L, c[["rowsFromInd"]], ninds * c[["rowsFromInd"]]) # ekat = (1:rowsFromInd:ninds*rowsFromInd)'; ekat <- seq(1L, ninds * c[["rowsFromInd"]], c[["rowsFromInd"]])
c[["rows"]] <- c(ekat, ekat + c[["rowsFromInd"]] - 1L) # c.rows = [ekat ekat+rowsFromInd-1] c[["rows"]] <- cbind(ekat, ekat + c[["rowsFromInd"]] - 1L)
logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose) logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose)
logml <- logml_npops_partitionSummary[["logml"]] logml <- logml_npops_partitionSummary[["logml"]]
npops <- logml_npops_partitionSummary[["npops"]] npops <- logml_npops_partitionSummary[["npops"]]

View file

@ -30,17 +30,17 @@ handleData <- function(raw_data, format = "Genepop") {
"bam" = stop("BAM format not supported for processing yet") "bam" = stop("BAM format not supported for processing yet")
) )
data <- as.matrix(raw_data) data <- as.matrix(raw_data)
dataApu <- data[, 1:nloci] dataApu <- data[, seq_len(nloci)]
nollat <- matlab2r::find(dataApu == 0) nollat <- matlab2r::find(dataApu == 0)
if (!isempty(nollat)) { if (!isempty(nollat)) {
isoinAlleeli <- base::max(base::max(dataApu)) isoinAlleeli <- base::max(base::max(dataApu))
dataApu[nollat] <- isoinAlleeli + 1 dataApu[nollat] <- isoinAlleeli + 1
data[, 1:nloci] <- dataApu data[, seq_len(nloci)] <- dataApu
} }
noalle <- zeros(1, nloci) noalle <- zeros(1, nloci)
alleelitLokuksessa <- cell(nloci, 1, expandable = TRUE) alleelitLokuksessa <- cell(nloci, 1, expandable = TRUE)
for (i in 1:nloci) { for (i in seq_len(nloci)) {
alleelitLokuksessaI <- unique(data[, i]) alleelitLokuksessaI <- unique(data[, i])
alleelitLokuksessa[[i]] <- sort(alleelitLokuksessaI[ alleelitLokuksessa[[i]] <- sort(alleelitLokuksessaI[
matlab2r::find(alleelitLokuksessaI >= 0) matlab2r::find(alleelitLokuksessaI >= 0)
@ -48,7 +48,7 @@ handleData <- function(raw_data, format = "Genepop") {
noalle[i] <- length(alleelitLokuksessa[[i]]) noalle[i] <- length(alleelitLokuksessa[[i]])
} }
alleleCodes <- zeros(base::max(noalle), nloci) alleleCodes <- zeros(base::max(noalle), nloci)
for (i in 1:nloci) { for (i in seq_len(nloci)) {
alleelitLokuksessaI <- alleelitLokuksessa[[i]] alleelitLokuksessaI <- alleelitLokuksessa[[i]]
puuttuvia <- base::max(noalle) - length(alleelitLokuksessaI) puuttuvia <- base::max(noalle) - length(alleelitLokuksessaI)
alleleCodes[, i] <- as.matrix(c(alleelitLokuksessaI, zeros(puuttuvia, 1))) alleleCodes[, i] <- as.matrix(c(alleelitLokuksessaI, zeros(puuttuvia, 1)))
@ -83,7 +83,7 @@ handleData <- function(raw_data, format = "Genepop") {
adjprior <- zeros(base::max(noalle), nloci) adjprior <- zeros(base::max(noalle), nloci)
priorTerm <- 0 priorTerm <- 0
for (j in 1:nloci) { for (j in seq_len(nloci)) {
adjprior[, j] <- as.matrix(c( adjprior[, j] <- as.matrix(c(
repmat(1 / noalle[j], c(noalle[j], 1)), repmat(1 / noalle[j], c(noalle[j], 1)),
ones(base::max(noalle) - noalle[j], 1) ones(base::max(noalle) - noalle[j], 1)

View file

@ -21,11 +21,9 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
Z <- NULL Z <- NULL
} }
if (missing(npops)) {
rm(c) # Recreate npopsTaulu from objects other than npops
nargin <- length(as.list(match.call())) - 1 dispText <- TRUE
if (nargin < 2) {
dispText <- 1
npopstext <- matrix() npopstext <- matrix()
ready <- FALSE ready <- FALSE
teksti <- "Input upper bound to the number of populations (possibly multiple values)" teksti <- "Input upper bound to the number of populations (possibly multiple values)"
@ -75,7 +73,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
partitionSummary[, 1] <- zeros(30, 1) partitionSummary[, 1] <- zeros(30, 1)
worstLogml <- -1e50 worstLogml <- -1e50
worstIndex <- 1 worstIndex <- 1
for (run in seq_along(nruns)) { for (run in seq_len(nruns)) {
npops <- npopsTaulu[[run]] npops <- npopsTaulu[[run]]
if (dispText) { if (dispText) {
dispLine() dispLine()
@ -110,7 +108,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
# PARHAAN MIXTURE-PARTITION ETSIMINEN # PARHAAN MIXTURE-PARTITION ETSIMINEN
nRoundTypes <- 7 nRoundTypes <- 7
kokeiltu <- zeros(nRoundTypes, 1) kokeiltu <- zeros(nRoundTypes, 1)
roundTypes <- c(1, 1) # Ykk<6B>svaiheen sykli kahteen kertaan. roundTypes <- t(c(1, 1)) # Ykk<6B>svaiheen sykli kahteen kertaan.
ready <- 0 ready <- 0
vaihe <- 1 vaihe <- 1
@ -143,9 +141,9 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
if (kokeiltu[round] == 1) { # Askelta kokeiltu viime muutoksen j<>lkeen if (kokeiltu[round] == 1) { # Askelta kokeiltu viime muutoksen j<>lkeen
} else if (round == 0 | round == 1) { # Yksil<69>n siirt<72>minen toiseen populaatioon. } else if (round == 0 | round == 1) { # Yksil<69>n siirt<72>minen toiseen populaatioon.
inds <- seq_len(ninds) inds <- seq_len(ninds)
aputaulu <- cbind(t(inds), rand(ninds, 1)) aputaulu <- cbind(inds, rand(ninds, 1))
aputaulu <- matrix(sortrows(aputaulu, 2), nrow = nrow(aputaulu)) aputaulu <- matrix(sortrows(aputaulu, 2), nrow = nrow(aputaulu))
inds <- t(aputaulu[, 1]) inds <- aputaulu[, 1]
muutosNyt <- 0 muutosNyt <- 0
for (ind in inds) { for (ind in inds) {
@ -324,7 +322,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
if (round == 5) { if (round == 5) {
aputaulu <- c(inds, rand(length(inds), 1)) aputaulu <- c(inds, rand(length(inds), 1))
aputaulu <- sortrows(aputaulu, 2) aputaulu <- sortrows(aputaulu, 2)
inds <- t(aputaulu[, 1]) inds <- aputaulu[, 1]
} else if (round == 6) { } else if (round == 6) {
inds <- returnInOrder( inds <- returnInOrder(
inds, pop, rows, data, adjprior, priorTerm inds, pop, rows, data, adjprior, priorTerm
@ -525,15 +523,15 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
if (ready == 0) { if (ready == 0) {
if (vaihe == 1) { if (vaihe == 1) {
roundTypes <- 1 roundTypes <- t(1)
} else if (vaihe == 2) { } else if (vaihe == 2) {
roundTypes <- c(2, 1) roundTypes <- t(c(2, 1))
} else if (vaihe == 3) { } else if (vaihe == 3) {
roundTypes <- c(5, 5, 7) roundTypes <- t(c(5, 5, 7))
} else if (vaihe == 4) { } else if (vaihe == 4) {
roundTypes <- c(4, 3, 1) roundTypes <- t(c(4, 3, 1))
} else if (vaihe == 5) { } else if (vaihe == 5) {
roundTypes <- c(6, 7, 2, 3, 4, 1) roundTypes <- t(c(6, 7, 2, 3, 4, 1))
} }
} }
} }

View file

@ -7,8 +7,8 @@ initialCounts <- function(partition, data, npops, rows, noalle, adjprior) {
counts <- zeros(base::max(noalle), nloci, npops) counts <- zeros(base::max(noalle), nloci, npops)
sumcounts <- zeros(npops, nloci) sumcounts <- zeros(npops, nloci)
for (i in 1:npops) { for (i in seq_len(npops)) {
for (j in 1:nloci) { for (j in seq_len(nloci)) {
havainnotLokuksessa <- matlab2r::find(partition == i & data[, j] >= 0) havainnotLokuksessa <- matlab2r::find(partition == i & data[, j] >= 0)
sumcounts[i, j] <- length(havainnotLokuksessa) sumcounts[i, j] <- length(havainnotLokuksessa)
for (k in 1:noalle[j]) { for (k in 1:noalle[j]) {

View file

@ -26,11 +26,11 @@ linkage <- function(Y, method = "co") {
monotonic <- 1 monotonic <- 1
Z <- zeros(m - 1, 3) # allocate the output matrix. Z <- zeros(m - 1, 3) # allocate the output matrix.
N <- zeros(1, 2 * m - 1) N <- zeros(1, 2 * m - 1)
N[1:m] <- 1 N[seq_len(m)] <- 1
n <- m # since m is changing, we need to save m in n. n <- m # since m is changing, we need to save m in n.
R <- 1:n R <- seq_len(n)
for (s in 1:(n - 1)) { for (s in 1:(n - 1)) {
X <- as.matrix(as.vector(Y), ncol = 1) X <- as.matrix(as.vector(Y), nrow = 1)
v <- matlab2r::min(X)$mins v <- matlab2r::min(X)$mins
k <- matlab2r::min(X)$idx k <- matlab2r::min(X)$idx
@ -83,11 +83,14 @@ linkage <- function(Y, method = "co") {
) )
J <- c(J, i * (m - (i + 1) / 2) - m + j) J <- c(J, i * (m - (i + 1) / 2) - m + j)
Y <- Y[-J] # no need for the cluster information about j Y <- Y[-J] # no need for the cluster information about j
# update m, N, R # update m, N, R
m <- m - 1 m <- m - 1
N[n + s] <- N[R[i]] + N[R[j]] N[n + s] <- N[R[i]] + N[R[j]]
R[i] <- n + s R[i] <- n + s
R[j:(n - 1)] <- R[(j + 1):n] if (j < n) {
R[j:(n - 1)] <- R[(j + 1):n]
}
} }
return(Z) return(Z)
} }

View file

@ -1,7 +1,6 @@
#' @title Logml to string #' @title Logml to string
#' @description Returns a string representation of a logml #' @description Returns a string representation of a logml
#' @param logml input Logml #' @param logml input Logml
#' @param
#' @param leading_zeros_replacement string to replace leading zeros with #' @param leading_zeros_replacement string to replace leading zeros with
#' @return String version of logml #' @return String version of logml
logml2String <- function(logml, leading_zeros_replacement = " ") { logml2String <- function(logml, leading_zeros_replacement = " ") {

View file

@ -35,7 +35,7 @@ newGetDistances <- function(data, rowsFromInd) {
y <- zeros(size(toka)) y <- zeros(size(toka))
y <- apply(y, 2, as.integer) y <- apply(y, 2, as.integer)
for (j in 1:nloci) { for (j in seq_len(nloci)) {
for (k in 1:rowsFromInd) { for (k in 1:rowsFromInd) {
x[, k] <- data[eka[, k], j] x[, k] <- data[eka[, k], j]
y[, k] <- data[toka[, k], j] y[, k] <- data[toka[, k], j]

37
R/process_BAPS_data.R Normal file
View file

@ -0,0 +1,37 @@
process_BAPS_data <- function(file, partitionCompare) {
if (!is.null(partitionCompare)) {
cat('Data:', file, '\n')
}
data <- read.table(file)
ninds <- testaaOnkoKunnollinenBapsData(data) # for testing purposes?
if (ninds == 0) {
warning('Incorrect Data-file.')
return(NULL)
}
popnames <- NULL # Dropped specification of population names (from BAPS 6)
result <- handleData(data, format = "BAPS")
data <- result$newData
rowsFromInd <- result$rowsFromInd
alleleCodes <- result$alleleCodes
noalle <- result$noalle
adjprior <- result$adjprior
priorTerm <- result$priorTerm
result <- newGetDistances(data, rowsFromInd)
Z <- result$Z
dist <- result$dist
# Forming and saving pre-processed data
processed_data <- list(
data = data,
rowsFromInd = rowsFromInd,
alleleCodes = alleleCodes,
noalle = noalle,
adjprior = adjprior,
priorTerm = priorTerm,
dist = dist,
popnames = popnames,
Z = Z
)
return(processed_data)
}

View file

@ -1,6 +1,5 @@
#' @title Bayesian Analysis of Population Structure #' @title Bayesian Analysis of Population Structure
#' @description This is a partial implementation of the BAPS software #' @description This is a partial implementation of the BAPS software
#' @docType package
#' @name rBAPS #' @name rBAPS
#' @note Found a bug? Want to suggest a feature? Contribute to the scientific #' @note Found a bug? Want to suggest a feature? Contribute to the scientific
#' and open source communities by opening an issue on our home page. #' and open source communities by opening an issue on our home page.
@ -11,4 +10,5 @@
#' @importFrom stats runif #' @importFrom stats runif
#' @importFrom zeallot %<-% #' @importFrom zeallot %<-%
#' @importFrom matlab2r nargin log2 #' @importFrom matlab2r nargin log2
NULL #' @importFrom utils read.table
"_PACKAGE"

View file

@ -8,7 +8,6 @@ greedyMix(
data, data,
format = gsub("^.*\\\\.", "", data), format = gsub("^.*\\\\.", "", data),
partitionCompare = NULL, partitionCompare = NULL,
ninds = 1L,
npops = 1L, npops = 1L,
counts = NULL, counts = NULL,
sumcounts = NULL, sumcounts = NULL,
@ -27,8 +26,6 @@ greedyMix(
\item{partitionCompare}{a list of partitions to compare} \item{partitionCompare}{a list of partitions to compare}
\item{ninds}{number of individuals}
\item{npops}{number of populations} \item{npops}{number of populations}
\item{counts}{counts} \item{counts}{counts}
@ -51,8 +48,10 @@ greedyMix(
Clustering of individuals Clustering of individuals
} }
\examples{ \examples{
data <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS") \dontrun{ # TEMP: unwrap once #24 is resolved
greedyMix(data, "fasta") data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS")
greedyMix(data, "baps")
} # TEMP: unwrap once #24 is resolved
} }
\references{ \references{
Samtools: a suite of programs for interacting Samtools: a suite of programs for interacting

View file

@ -2,6 +2,7 @@
% Please edit documentation in R/rBAPS-package.R % Please edit documentation in R/rBAPS-package.R
\docType{package} \docType{package}
\name{rBAPS} \name{rBAPS}
\alias{rBAPS-package}
\alias{rBAPS} \alias{rBAPS}
\title{Bayesian Analysis of Population Structure} \title{Bayesian Analysis of Population Structure}
\description{ \description{
@ -12,3 +13,20 @@ Found a bug? Want to suggest a feature? Contribute to the scientific
and open source communities by opening an issue on our home page. and open source communities by opening an issue on our home page.
Check the "BugReports" field on the package description for the URL. Check the "BugReports" field on the package description for the URL.
} }
\seealso{
Useful links:
\itemize{
\item Report bugs at \url{https://github.com/ocbe-uio/rBAPS/issues}
}
}
\author{
\strong{Maintainer}: Waldir Leoncio \email{w.l.netto@medisin.uio.no}
Authors:
\itemize{
\item Jukka Corander \email{jukka.corander@medisin.uio.no}
\item Gerry Tonkin-Hill \email{jukka.corander@medisin.uio.no}
}
}

View file

@ -57,12 +57,15 @@ test_that("Files are imported correctly", {
) )
) )
expect_equal(length(raw_bam[[1]]), 13) expect_equal(length(raw_bam[[1]]), 13)
expect_error(
greedyMix(
data = file.path(path_inst, "FASTA_clustering_haploid.fasta"),
format = "FASTA"
),
"FASTA format not yet supported on greedyMix"
)
}) })
df_fasta <- greedyMix(
data = file.path(path_inst, "FASTA_clustering_haploid.fasta"),
format = "FASTA"
)
test_that("greedyMix() works", { test_that("greedyMix() works", {
expect_error(greedyMix(file.path(path_inst, "vcf_example.vcf"))) expect_error(greedyMix(file.path(path_inst, "vcf_example.vcf")))
expect_error(greedyMix(file.path(path_inst, "bam_example.bam"))) expect_error(greedyMix(file.path(path_inst, "bam_example.bam")))