Merge branch 'baps-format' into develop

* baps-format:
  Increment version number to 0.0.0.9025
  Skips `greedyMix()` example until #24 is fixed
  Updated documentation (#24)
  Fails for FAST input to `greedyMix()` (#24)
  Fixed calculation og `ninds` (#24)
  Bugfixes to `indMix()` and subfunctions (#24)
  Improved handling of BAPS format (#24)
  Fixed calculation of `ninds` `ekat` (#24)
  Added processing of BAPS format
This commit is contained in:
Waldir Leoncio 2024-04-08 13:35:51 +02:00
commit 9b8da3ab4d
14 changed files with 134 additions and 57 deletions

View file

@ -1,6 +1,6 @@
Package: rBAPS
Title: Bayesian Analysis of Population Structure
Version: 0.0.0.9024
Version: 0.0.0.9025
Date: 2020-11-09
Authors@R:
c(
@ -36,7 +36,7 @@ Description: Partial R implementation of the BAPS software
License: GPL-3
BugReports: https://github.com/ocbe-uio/rBAPS/issues
Encoding: UTF-8
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
Suggests:
testthat (>= 2.1.0)
Imports:

View file

@ -38,5 +38,6 @@ importFrom(methods,is)
importFrom(stats,runif)
importFrom(stats,sd)
importFrom(utils,read.delim)
importFrom(utils,read.table)
importFrom(vcfR,read.vcfR)
importFrom(zeallot,"%<-%")

View file

@ -2,7 +2,6 @@
#' @param data data file
#' @param format Data format. Format supported: "FASTA", "VCF" ,"BAM", "GenePop"
#' @param partitionCompare a list of partitions to compare
#' @param ninds number of individuals
#' @param npops number of populations
#' @param counts counts
#' @param sumcounts sumcounts
@ -20,25 +19,46 @@
#' with high-throughput sequencing data. <http://www.htslib.org/>
#' @export
#' @examples
#' data <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS")
#' greedyMix(data, "fasta")
#' \dontrun{ # TEMP: unwrap once #24 is resolved
#' data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS")
#' greedyMix(data, "baps")
#' } # TEMP: unwrap once #24 is resolved
greedyMix <- function(
data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, ninds = 1L, npops = 1L,
data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, npops = 1L,
counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL,
inp = NULL, popnames = NULL, fixedK = FALSE, verbose = FALSE
) {
# Importing and handling data ================================================
data <- importFile(data, format, verbose)
data <- handleData(data, tolower(format))
c <- list(
noalle = data[["noalle"]],
data = data[["newData"]],
adjprior = data[["adjprior"]],
priorTerm = data[["priorTerm"]],
rowsFromInd = data[["rowsFromInd"]]
)
if (tolower(format) %in% "fasta") {
stop("FASTA format not yet supported on greedyMix")
}
if (tolower(format) %in% "baps") {
data <- process_BAPS_data(data, NULL)
c <- list(
noalle = data[["noalle"]],
data = data[["data"]],
adjprior = data[["adjprior"]],
priorTerm = data[["priorTerm"]],
rowsFromInd = data[["rowsFromInd"]],
Z = data[["Z"]],
dist = data[["dist"]]
)
} else {
data <- importFile(data, format, verbose)
data <- handleData(data, tolower(format))
c <- list(
noalle = data[["noalle"]],
data = data[["newData"]],
adjprior = data[["adjprior"]],
priorTerm = data[["priorTerm"]],
rowsFromInd = data[["rowsFromInd"]],
Z = data[["Z"]],
dist = data[["dist"]]
)
}
# Comparing partitions =======================================================
ninds <- length(unique(c[["data"]][, ncol(c[["data"]])]))
if (!is.null(partitionCompare)) {
logmls <- comparePartitions(
c[["data"]], nrow(c[["data"]]), partitionCompare[["partitions"]], ninds,
@ -46,10 +66,9 @@ greedyMix <- function(
)
}
# Generating partition summary ===============================================
ekat <- seq(1L, c[["rowsFromInd"]], ninds * c[["rowsFromInd"]]) # ekat = (1:rowsFromInd:ninds*rowsFromInd)';
c[["rows"]] <- c(ekat, ekat + c[["rowsFromInd"]] - 1L) # c.rows = [ekat ekat+rowsFromInd-1]
ekat <- seq(1L, ninds * c[["rowsFromInd"]], c[["rowsFromInd"]])
c[["rows"]] <- cbind(ekat, ekat + c[["rowsFromInd"]] - 1L)
logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose)
logml <- logml_npops_partitionSummary[["logml"]]
npops <- logml_npops_partitionSummary[["npops"]]

View file

@ -30,17 +30,17 @@ handleData <- function(raw_data, format = "Genepop") {
"bam" = stop("BAM format not supported for processing yet")
)
data <- as.matrix(raw_data)
dataApu <- data[, 1:nloci]
dataApu <- data[, seq_len(nloci)]
nollat <- matlab2r::find(dataApu == 0)
if (!isempty(nollat)) {
isoinAlleeli <- base::max(base::max(dataApu))
dataApu[nollat] <- isoinAlleeli + 1
data[, 1:nloci] <- dataApu
data[, seq_len(nloci)] <- dataApu
}
noalle <- zeros(1, nloci)
alleelitLokuksessa <- cell(nloci, 1, expandable = TRUE)
for (i in 1:nloci) {
for (i in seq_len(nloci)) {
alleelitLokuksessaI <- unique(data[, i])
alleelitLokuksessa[[i]] <- sort(alleelitLokuksessaI[
matlab2r::find(alleelitLokuksessaI >= 0)
@ -48,7 +48,7 @@ handleData <- function(raw_data, format = "Genepop") {
noalle[i] <- length(alleelitLokuksessa[[i]])
}
alleleCodes <- zeros(base::max(noalle), nloci)
for (i in 1:nloci) {
for (i in seq_len(nloci)) {
alleelitLokuksessaI <- alleelitLokuksessa[[i]]
puuttuvia <- base::max(noalle) - length(alleelitLokuksessaI)
alleleCodes[, i] <- as.matrix(c(alleelitLokuksessaI, zeros(puuttuvia, 1)))
@ -83,7 +83,7 @@ handleData <- function(raw_data, format = "Genepop") {
adjprior <- zeros(base::max(noalle), nloci)
priorTerm <- 0
for (j in 1:nloci) {
for (j in seq_len(nloci)) {
adjprior[, j] <- as.matrix(c(
repmat(1 / noalle[j], c(noalle[j], 1)),
ones(base::max(noalle) - noalle[j], 1)

View file

@ -21,11 +21,9 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
Z <- NULL
}
rm(c)
nargin <- length(as.list(match.call())) - 1
if (nargin < 2) {
dispText <- 1
if (missing(npops)) {
# Recreate npopsTaulu from objects other than npops
dispText <- TRUE
npopstext <- matrix()
ready <- FALSE
teksti <- "Input upper bound to the number of populations (possibly multiple values)"
@ -75,7 +73,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
partitionSummary[, 1] <- zeros(30, 1)
worstLogml <- -1e50
worstIndex <- 1
for (run in seq_along(nruns)) {
for (run in seq_len(nruns)) {
npops <- npopsTaulu[[run]]
if (dispText) {
dispLine()
@ -110,7 +108,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
# PARHAAN MIXTURE-PARTITION ETSIMINEN
nRoundTypes <- 7
kokeiltu <- zeros(nRoundTypes, 1)
roundTypes <- c(1, 1) # Ykk<6B>svaiheen sykli kahteen kertaan.
roundTypes <- t(c(1, 1)) # Ykk<6B>svaiheen sykli kahteen kertaan.
ready <- 0
vaihe <- 1
@ -143,9 +141,9 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
if (kokeiltu[round] == 1) { # Askelta kokeiltu viime muutoksen j<>lkeen
} else if (round == 0 | round == 1) { # Yksil<69>n siirt<72>minen toiseen populaatioon.
inds <- seq_len(ninds)
aputaulu <- cbind(t(inds), rand(ninds, 1))
aputaulu <- cbind(inds, rand(ninds, 1))
aputaulu <- matrix(sortrows(aputaulu, 2), nrow = nrow(aputaulu))
inds <- t(aputaulu[, 1])
inds <- aputaulu[, 1]
muutosNyt <- 0
for (ind in inds) {
@ -324,7 +322,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
if (round == 5) {
aputaulu <- c(inds, rand(length(inds), 1))
aputaulu <- sortrows(aputaulu, 2)
inds <- t(aputaulu[, 1])
inds <- aputaulu[, 1]
} else if (round == 6) {
inds <- returnInOrder(
inds, pop, rows, data, adjprior, priorTerm
@ -525,15 +523,15 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
if (ready == 0) {
if (vaihe == 1) {
roundTypes <- 1
roundTypes <- t(1)
} else if (vaihe == 2) {
roundTypes <- c(2, 1)
roundTypes <- t(c(2, 1))
} else if (vaihe == 3) {
roundTypes <- c(5, 5, 7)
roundTypes <- t(c(5, 5, 7))
} else if (vaihe == 4) {
roundTypes <- c(4, 3, 1)
roundTypes <- t(c(4, 3, 1))
} else if (vaihe == 5) {
roundTypes <- c(6, 7, 2, 3, 4, 1)
roundTypes <- t(c(6, 7, 2, 3, 4, 1))
}
}
}

View file

@ -7,8 +7,8 @@ initialCounts <- function(partition, data, npops, rows, noalle, adjprior) {
counts <- zeros(base::max(noalle), nloci, npops)
sumcounts <- zeros(npops, nloci)
for (i in 1:npops) {
for (j in 1:nloci) {
for (i in seq_len(npops)) {
for (j in seq_len(nloci)) {
havainnotLokuksessa <- matlab2r::find(partition == i & data[, j] >= 0)
sumcounts[i, j] <- length(havainnotLokuksessa)
for (k in 1:noalle[j]) {

View file

@ -26,11 +26,11 @@ linkage <- function(Y, method = "co") {
monotonic <- 1
Z <- zeros(m - 1, 3) # allocate the output matrix.
N <- zeros(1, 2 * m - 1)
N[1:m] <- 1
N[seq_len(m)] <- 1
n <- m # since m is changing, we need to save m in n.
R <- 1:n
R <- seq_len(n)
for (s in 1:(n - 1)) {
X <- as.matrix(as.vector(Y), ncol = 1)
X <- as.matrix(as.vector(Y), nrow = 1)
v <- matlab2r::min(X)$mins
k <- matlab2r::min(X)$idx
@ -83,11 +83,14 @@ linkage <- function(Y, method = "co") {
)
J <- c(J, i * (m - (i + 1) / 2) - m + j)
Y <- Y[-J] # no need for the cluster information about j
# update m, N, R
m <- m - 1
N[n + s] <- N[R[i]] + N[R[j]]
R[i] <- n + s
R[j:(n - 1)] <- R[(j + 1):n]
if (j < n) {
R[j:(n - 1)] <- R[(j + 1):n]
}
}
return(Z)
}

View file

@ -1,7 +1,6 @@
#' @title Logml to string
#' @description Returns a string representation of a logml
#' @param logml input Logml
#' @param
#' @param leading_zeros_replacement string to replace leading zeros with
#' @return String version of logml
logml2String <- function(logml, leading_zeros_replacement = " ") {

View file

@ -35,7 +35,7 @@ newGetDistances <- function(data, rowsFromInd) {
y <- zeros(size(toka))
y <- apply(y, 2, as.integer)
for (j in 1:nloci) {
for (j in seq_len(nloci)) {
for (k in 1:rowsFromInd) {
x[, k] <- data[eka[, k], j]
y[, k] <- data[toka[, k], j]

37
R/process_BAPS_data.R Normal file
View file

@ -0,0 +1,37 @@
process_BAPS_data <- function(file, partitionCompare) {
if (!is.null(partitionCompare)) {
cat('Data:', file, '\n')
}
data <- read.table(file)
ninds <- testaaOnkoKunnollinenBapsData(data) # for testing purposes?
if (ninds == 0) {
warning('Incorrect Data-file.')
return(NULL)
}
popnames <- NULL # Dropped specification of population names (from BAPS 6)
result <- handleData(data, format = "BAPS")
data <- result$newData
rowsFromInd <- result$rowsFromInd
alleleCodes <- result$alleleCodes
noalle <- result$noalle
adjprior <- result$adjprior
priorTerm <- result$priorTerm
result <- newGetDistances(data, rowsFromInd)
Z <- result$Z
dist <- result$dist
# Forming and saving pre-processed data
processed_data <- list(
data = data,
rowsFromInd = rowsFromInd,
alleleCodes = alleleCodes,
noalle = noalle,
adjprior = adjprior,
priorTerm = priorTerm,
dist = dist,
popnames = popnames,
Z = Z
)
return(processed_data)
}

View file

@ -1,6 +1,5 @@
#' @title Bayesian Analysis of Population Structure
#' @description This is a partial implementation of the BAPS software
#' @docType package
#' @name rBAPS
#' @note Found a bug? Want to suggest a feature? Contribute to the scientific
#' and open source communities by opening an issue on our home page.
@ -11,4 +10,5 @@
#' @importFrom stats runif
#' @importFrom zeallot %<-%
#' @importFrom matlab2r nargin log2
NULL
#' @importFrom utils read.table
"_PACKAGE"

View file

@ -8,7 +8,6 @@ greedyMix(
data,
format = gsub("^.*\\\\.", "", data),
partitionCompare = NULL,
ninds = 1L,
npops = 1L,
counts = NULL,
sumcounts = NULL,
@ -27,8 +26,6 @@ greedyMix(
\item{partitionCompare}{a list of partitions to compare}
\item{ninds}{number of individuals}
\item{npops}{number of populations}
\item{counts}{counts}
@ -51,8 +48,10 @@ greedyMix(
Clustering of individuals
}
\examples{
data <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS")
greedyMix(data, "fasta")
\dontrun{ # TEMP: unwrap once #24 is resolved
data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS")
greedyMix(data, "baps")
} # TEMP: unwrap once #24 is resolved
}
\references{
Samtools: a suite of programs for interacting

View file

@ -2,6 +2,7 @@
% Please edit documentation in R/rBAPS-package.R
\docType{package}
\name{rBAPS}
\alias{rBAPS-package}
\alias{rBAPS}
\title{Bayesian Analysis of Population Structure}
\description{
@ -12,3 +13,20 @@ Found a bug? Want to suggest a feature? Contribute to the scientific
and open source communities by opening an issue on our home page.
Check the "BugReports" field on the package description for the URL.
}
\seealso{
Useful links:
\itemize{
\item Report bugs at \url{https://github.com/ocbe-uio/rBAPS/issues}
}
}
\author{
\strong{Maintainer}: Waldir Leoncio \email{w.l.netto@medisin.uio.no}
Authors:
\itemize{
\item Jukka Corander \email{jukka.corander@medisin.uio.no}
\item Gerry Tonkin-Hill \email{jukka.corander@medisin.uio.no}
}
}

View file

@ -57,12 +57,15 @@ test_that("Files are imported correctly", {
)
)
expect_equal(length(raw_bam[[1]]), 13)
expect_error(
greedyMix(
data = file.path(path_inst, "FASTA_clustering_haploid.fasta"),
format = "FASTA"
),
"FASTA format not yet supported on greedyMix"
)
})
df_fasta <- greedyMix(
data = file.path(path_inst, "FASTA_clustering_haploid.fasta"),
format = "FASTA"
)
test_that("greedyMix() works", {
expect_error(greedyMix(file.path(path_inst, "vcf_example.vcf")))
expect_error(greedyMix(file.path(path_inst, "bam_example.bam")))