Merge branch 'baps-format' into develop
* baps-format: Increment version number to 0.0.0.9025 Skips `greedyMix()` example until #24 is fixed Updated documentation (#24) Fails for FAST input to `greedyMix()` (#24) Fixed calculation og `ninds` (#24) Bugfixes to `indMix()` and subfunctions (#24) Improved handling of BAPS format (#24) Fixed calculation of `ninds` `ekat` (#24) Added processing of BAPS format
This commit is contained in:
commit
9b8da3ab4d
14 changed files with 134 additions and 57 deletions
|
|
@ -1,6 +1,6 @@
|
|||
Package: rBAPS
|
||||
Title: Bayesian Analysis of Population Structure
|
||||
Version: 0.0.0.9024
|
||||
Version: 0.0.0.9025
|
||||
Date: 2020-11-09
|
||||
Authors@R:
|
||||
c(
|
||||
|
|
@ -36,7 +36,7 @@ Description: Partial R implementation of the BAPS software
|
|||
License: GPL-3
|
||||
BugReports: https://github.com/ocbe-uio/rBAPS/issues
|
||||
Encoding: UTF-8
|
||||
RoxygenNote: 7.2.3
|
||||
RoxygenNote: 7.3.1
|
||||
Suggests:
|
||||
testthat (>= 2.1.0)
|
||||
Imports:
|
||||
|
|
|
|||
|
|
@ -38,5 +38,6 @@ importFrom(methods,is)
|
|||
importFrom(stats,runif)
|
||||
importFrom(stats,sd)
|
||||
importFrom(utils,read.delim)
|
||||
importFrom(utils,read.table)
|
||||
importFrom(vcfR,read.vcfR)
|
||||
importFrom(zeallot,"%<-%")
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@
|
|||
#' @param data data file
|
||||
#' @param format Data format. Format supported: "FASTA", "VCF" ,"BAM", "GenePop"
|
||||
#' @param partitionCompare a list of partitions to compare
|
||||
#' @param ninds number of individuals
|
||||
#' @param npops number of populations
|
||||
#' @param counts counts
|
||||
#' @param sumcounts sumcounts
|
||||
|
|
@ -20,25 +19,46 @@
|
|||
#' with high-throughput sequencing data. <http://www.htslib.org/>
|
||||
#' @export
|
||||
#' @examples
|
||||
#' data <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS")
|
||||
#' greedyMix(data, "fasta")
|
||||
#' \dontrun{ # TEMP: unwrap once #24 is resolved
|
||||
#' data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS")
|
||||
#' greedyMix(data, "baps")
|
||||
#' } # TEMP: unwrap once #24 is resolved
|
||||
greedyMix <- function(
|
||||
data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, ninds = 1L, npops = 1L,
|
||||
data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, npops = 1L,
|
||||
counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL,
|
||||
inp = NULL, popnames = NULL, fixedK = FALSE, verbose = FALSE
|
||||
) {
|
||||
# Importing and handling data ================================================
|
||||
data <- importFile(data, format, verbose)
|
||||
data <- handleData(data, tolower(format))
|
||||
c <- list(
|
||||
noalle = data[["noalle"]],
|
||||
data = data[["newData"]],
|
||||
adjprior = data[["adjprior"]],
|
||||
priorTerm = data[["priorTerm"]],
|
||||
rowsFromInd = data[["rowsFromInd"]]
|
||||
)
|
||||
if (tolower(format) %in% "fasta") {
|
||||
stop("FASTA format not yet supported on greedyMix")
|
||||
}
|
||||
if (tolower(format) %in% "baps") {
|
||||
data <- process_BAPS_data(data, NULL)
|
||||
c <- list(
|
||||
noalle = data[["noalle"]],
|
||||
data = data[["data"]],
|
||||
adjprior = data[["adjprior"]],
|
||||
priorTerm = data[["priorTerm"]],
|
||||
rowsFromInd = data[["rowsFromInd"]],
|
||||
Z = data[["Z"]],
|
||||
dist = data[["dist"]]
|
||||
)
|
||||
} else {
|
||||
data <- importFile(data, format, verbose)
|
||||
data <- handleData(data, tolower(format))
|
||||
c <- list(
|
||||
noalle = data[["noalle"]],
|
||||
data = data[["newData"]],
|
||||
adjprior = data[["adjprior"]],
|
||||
priorTerm = data[["priorTerm"]],
|
||||
rowsFromInd = data[["rowsFromInd"]],
|
||||
Z = data[["Z"]],
|
||||
dist = data[["dist"]]
|
||||
)
|
||||
}
|
||||
|
||||
# Comparing partitions =======================================================
|
||||
ninds <- length(unique(c[["data"]][, ncol(c[["data"]])]))
|
||||
if (!is.null(partitionCompare)) {
|
||||
logmls <- comparePartitions(
|
||||
c[["data"]], nrow(c[["data"]]), partitionCompare[["partitions"]], ninds,
|
||||
|
|
@ -46,10 +66,9 @@ greedyMix <- function(
|
|||
)
|
||||
}
|
||||
|
||||
|
||||
# Generating partition summary ===============================================
|
||||
ekat <- seq(1L, c[["rowsFromInd"]], ninds * c[["rowsFromInd"]]) # ekat = (1:rowsFromInd:ninds*rowsFromInd)';
|
||||
c[["rows"]] <- c(ekat, ekat + c[["rowsFromInd"]] - 1L) # c.rows = [ekat ekat+rowsFromInd-1]
|
||||
ekat <- seq(1L, ninds * c[["rowsFromInd"]], c[["rowsFromInd"]])
|
||||
c[["rows"]] <- cbind(ekat, ekat + c[["rowsFromInd"]] - 1L)
|
||||
logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose)
|
||||
logml <- logml_npops_partitionSummary[["logml"]]
|
||||
npops <- logml_npops_partitionSummary[["npops"]]
|
||||
|
|
|
|||
|
|
@ -30,17 +30,17 @@ handleData <- function(raw_data, format = "Genepop") {
|
|||
"bam" = stop("BAM format not supported for processing yet")
|
||||
)
|
||||
data <- as.matrix(raw_data)
|
||||
dataApu <- data[, 1:nloci]
|
||||
dataApu <- data[, seq_len(nloci)]
|
||||
nollat <- matlab2r::find(dataApu == 0)
|
||||
if (!isempty(nollat)) {
|
||||
isoinAlleeli <- base::max(base::max(dataApu))
|
||||
dataApu[nollat] <- isoinAlleeli + 1
|
||||
data[, 1:nloci] <- dataApu
|
||||
data[, seq_len(nloci)] <- dataApu
|
||||
}
|
||||
|
||||
noalle <- zeros(1, nloci)
|
||||
alleelitLokuksessa <- cell(nloci, 1, expandable = TRUE)
|
||||
for (i in 1:nloci) {
|
||||
for (i in seq_len(nloci)) {
|
||||
alleelitLokuksessaI <- unique(data[, i])
|
||||
alleelitLokuksessa[[i]] <- sort(alleelitLokuksessaI[
|
||||
matlab2r::find(alleelitLokuksessaI >= 0)
|
||||
|
|
@ -48,7 +48,7 @@ handleData <- function(raw_data, format = "Genepop") {
|
|||
noalle[i] <- length(alleelitLokuksessa[[i]])
|
||||
}
|
||||
alleleCodes <- zeros(base::max(noalle), nloci)
|
||||
for (i in 1:nloci) {
|
||||
for (i in seq_len(nloci)) {
|
||||
alleelitLokuksessaI <- alleelitLokuksessa[[i]]
|
||||
puuttuvia <- base::max(noalle) - length(alleelitLokuksessaI)
|
||||
alleleCodes[, i] <- as.matrix(c(alleelitLokuksessaI, zeros(puuttuvia, 1)))
|
||||
|
|
@ -83,7 +83,7 @@ handleData <- function(raw_data, format = "Genepop") {
|
|||
|
||||
adjprior <- zeros(base::max(noalle), nloci)
|
||||
priorTerm <- 0
|
||||
for (j in 1:nloci) {
|
||||
for (j in seq_len(nloci)) {
|
||||
adjprior[, j] <- as.matrix(c(
|
||||
repmat(1 / noalle[j], c(noalle[j], 1)),
|
||||
ones(base::max(noalle) - noalle[j], 1)
|
||||
|
|
|
|||
28
R/indMix.R
28
R/indMix.R
|
|
@ -21,11 +21,9 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
|
|||
Z <- NULL
|
||||
}
|
||||
|
||||
|
||||
rm(c)
|
||||
nargin <- length(as.list(match.call())) - 1
|
||||
if (nargin < 2) {
|
||||
dispText <- 1
|
||||
if (missing(npops)) {
|
||||
# Recreate npopsTaulu from objects other than npops
|
||||
dispText <- TRUE
|
||||
npopstext <- matrix()
|
||||
ready <- FALSE
|
||||
teksti <- "Input upper bound to the number of populations (possibly multiple values)"
|
||||
|
|
@ -75,7 +73,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
|
|||
partitionSummary[, 1] <- zeros(30, 1)
|
||||
worstLogml <- -1e50
|
||||
worstIndex <- 1
|
||||
for (run in seq_along(nruns)) {
|
||||
for (run in seq_len(nruns)) {
|
||||
npops <- npopsTaulu[[run]]
|
||||
if (dispText) {
|
||||
dispLine()
|
||||
|
|
@ -110,7 +108,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
|
|||
# PARHAAN MIXTURE-PARTITION ETSIMINEN
|
||||
nRoundTypes <- 7
|
||||
kokeiltu <- zeros(nRoundTypes, 1)
|
||||
roundTypes <- c(1, 1) # Ykk<6B>svaiheen sykli kahteen kertaan.
|
||||
roundTypes <- t(c(1, 1)) # Ykk<6B>svaiheen sykli kahteen kertaan.
|
||||
ready <- 0
|
||||
vaihe <- 1
|
||||
|
||||
|
|
@ -143,9 +141,9 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
|
|||
if (kokeiltu[round] == 1) { # Askelta kokeiltu viime muutoksen j<>lkeen
|
||||
} else if (round == 0 | round == 1) { # Yksil<69>n siirt<72>minen toiseen populaatioon.
|
||||
inds <- seq_len(ninds)
|
||||
aputaulu <- cbind(t(inds), rand(ninds, 1))
|
||||
aputaulu <- cbind(inds, rand(ninds, 1))
|
||||
aputaulu <- matrix(sortrows(aputaulu, 2), nrow = nrow(aputaulu))
|
||||
inds <- t(aputaulu[, 1])
|
||||
inds <- aputaulu[, 1]
|
||||
muutosNyt <- 0
|
||||
|
||||
for (ind in inds) {
|
||||
|
|
@ -324,7 +322,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
|
|||
if (round == 5) {
|
||||
aputaulu <- c(inds, rand(length(inds), 1))
|
||||
aputaulu <- sortrows(aputaulu, 2)
|
||||
inds <- t(aputaulu[, 1])
|
||||
inds <- aputaulu[, 1]
|
||||
} else if (round == 6) {
|
||||
inds <- returnInOrder(
|
||||
inds, pop, rows, data, adjprior, priorTerm
|
||||
|
|
@ -525,15 +523,15 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
|
|||
|
||||
if (ready == 0) {
|
||||
if (vaihe == 1) {
|
||||
roundTypes <- 1
|
||||
roundTypes <- t(1)
|
||||
} else if (vaihe == 2) {
|
||||
roundTypes <- c(2, 1)
|
||||
roundTypes <- t(c(2, 1))
|
||||
} else if (vaihe == 3) {
|
||||
roundTypes <- c(5, 5, 7)
|
||||
roundTypes <- t(c(5, 5, 7))
|
||||
} else if (vaihe == 4) {
|
||||
roundTypes <- c(4, 3, 1)
|
||||
roundTypes <- t(c(4, 3, 1))
|
||||
} else if (vaihe == 5) {
|
||||
roundTypes <- c(6, 7, 2, 3, 4, 1)
|
||||
roundTypes <- t(c(6, 7, 2, 3, 4, 1))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,8 +7,8 @@ initialCounts <- function(partition, data, npops, rows, noalle, adjprior) {
|
|||
|
||||
counts <- zeros(base::max(noalle), nloci, npops)
|
||||
sumcounts <- zeros(npops, nloci)
|
||||
for (i in 1:npops) {
|
||||
for (j in 1:nloci) {
|
||||
for (i in seq_len(npops)) {
|
||||
for (j in seq_len(nloci)) {
|
||||
havainnotLokuksessa <- matlab2r::find(partition == i & data[, j] >= 0)
|
||||
sumcounts[i, j] <- length(havainnotLokuksessa)
|
||||
for (k in 1:noalle[j]) {
|
||||
|
|
|
|||
11
R/linkage.R
11
R/linkage.R
|
|
@ -26,11 +26,11 @@ linkage <- function(Y, method = "co") {
|
|||
monotonic <- 1
|
||||
Z <- zeros(m - 1, 3) # allocate the output matrix.
|
||||
N <- zeros(1, 2 * m - 1)
|
||||
N[1:m] <- 1
|
||||
N[seq_len(m)] <- 1
|
||||
n <- m # since m is changing, we need to save m in n.
|
||||
R <- 1:n
|
||||
R <- seq_len(n)
|
||||
for (s in 1:(n - 1)) {
|
||||
X <- as.matrix(as.vector(Y), ncol = 1)
|
||||
X <- as.matrix(as.vector(Y), nrow = 1)
|
||||
v <- matlab2r::min(X)$mins
|
||||
k <- matlab2r::min(X)$idx
|
||||
|
||||
|
|
@ -83,11 +83,14 @@ linkage <- function(Y, method = "co") {
|
|||
)
|
||||
J <- c(J, i * (m - (i + 1) / 2) - m + j)
|
||||
Y <- Y[-J] # no need for the cluster information about j
|
||||
|
||||
# update m, N, R
|
||||
m <- m - 1
|
||||
N[n + s] <- N[R[i]] + N[R[j]]
|
||||
R[i] <- n + s
|
||||
R[j:(n - 1)] <- R[(j + 1):n]
|
||||
if (j < n) {
|
||||
R[j:(n - 1)] <- R[(j + 1):n]
|
||||
}
|
||||
}
|
||||
return(Z)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
#' @title Logml to string
|
||||
#' @description Returns a string representation of a logml
|
||||
#' @param logml input Logml
|
||||
#' @param
|
||||
#' @param leading_zeros_replacement string to replace leading zeros with
|
||||
#' @return String version of logml
|
||||
logml2String <- function(logml, leading_zeros_replacement = " ") {
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ newGetDistances <- function(data, rowsFromInd) {
|
|||
y <- zeros(size(toka))
|
||||
y <- apply(y, 2, as.integer)
|
||||
|
||||
for (j in 1:nloci) {
|
||||
for (j in seq_len(nloci)) {
|
||||
for (k in 1:rowsFromInd) {
|
||||
x[, k] <- data[eka[, k], j]
|
||||
y[, k] <- data[toka[, k], j]
|
||||
|
|
|
|||
37
R/process_BAPS_data.R
Normal file
37
R/process_BAPS_data.R
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
process_BAPS_data <- function(file, partitionCompare) {
|
||||
if (!is.null(partitionCompare)) {
|
||||
cat('Data:', file, '\n')
|
||||
}
|
||||
data <- read.table(file)
|
||||
ninds <- testaaOnkoKunnollinenBapsData(data) # for testing purposes?
|
||||
if (ninds == 0) {
|
||||
warning('Incorrect Data-file.')
|
||||
return(NULL)
|
||||
}
|
||||
popnames <- NULL # Dropped specification of population names (from BAPS 6)
|
||||
|
||||
result <- handleData(data, format = "BAPS")
|
||||
data <- result$newData
|
||||
rowsFromInd <- result$rowsFromInd
|
||||
alleleCodes <- result$alleleCodes
|
||||
noalle <- result$noalle
|
||||
adjprior <- result$adjprior
|
||||
priorTerm <- result$priorTerm
|
||||
result <- newGetDistances(data, rowsFromInd)
|
||||
Z <- result$Z
|
||||
dist <- result$dist
|
||||
|
||||
# Forming and saving pre-processed data
|
||||
processed_data <- list(
|
||||
data = data,
|
||||
rowsFromInd = rowsFromInd,
|
||||
alleleCodes = alleleCodes,
|
||||
noalle = noalle,
|
||||
adjprior = adjprior,
|
||||
priorTerm = priorTerm,
|
||||
dist = dist,
|
||||
popnames = popnames,
|
||||
Z = Z
|
||||
)
|
||||
return(processed_data)
|
||||
}
|
||||
|
|
@ -1,6 +1,5 @@
|
|||
#' @title Bayesian Analysis of Population Structure
|
||||
#' @description This is a partial implementation of the BAPS software
|
||||
#' @docType package
|
||||
#' @name rBAPS
|
||||
#' @note Found a bug? Want to suggest a feature? Contribute to the scientific
|
||||
#' and open source communities by opening an issue on our home page.
|
||||
|
|
@ -11,4 +10,5 @@
|
|||
#' @importFrom stats runif
|
||||
#' @importFrom zeallot %<-%
|
||||
#' @importFrom matlab2r nargin log2
|
||||
NULL
|
||||
#' @importFrom utils read.table
|
||||
"_PACKAGE"
|
||||
|
|
|
|||
|
|
@ -8,7 +8,6 @@ greedyMix(
|
|||
data,
|
||||
format = gsub("^.*\\\\.", "", data),
|
||||
partitionCompare = NULL,
|
||||
ninds = 1L,
|
||||
npops = 1L,
|
||||
counts = NULL,
|
||||
sumcounts = NULL,
|
||||
|
|
@ -27,8 +26,6 @@ greedyMix(
|
|||
|
||||
\item{partitionCompare}{a list of partitions to compare}
|
||||
|
||||
\item{ninds}{number of individuals}
|
||||
|
||||
\item{npops}{number of populations}
|
||||
|
||||
\item{counts}{counts}
|
||||
|
|
@ -51,8 +48,10 @@ greedyMix(
|
|||
Clustering of individuals
|
||||
}
|
||||
\examples{
|
||||
data <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS")
|
||||
greedyMix(data, "fasta")
|
||||
\dontrun{ # TEMP: unwrap once #24 is resolved
|
||||
data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS")
|
||||
greedyMix(data, "baps")
|
||||
} # TEMP: unwrap once #24 is resolved
|
||||
}
|
||||
\references{
|
||||
Samtools: a suite of programs for interacting
|
||||
|
|
|
|||
18
man/rBAPS.Rd
18
man/rBAPS.Rd
|
|
@ -2,6 +2,7 @@
|
|||
% Please edit documentation in R/rBAPS-package.R
|
||||
\docType{package}
|
||||
\name{rBAPS}
|
||||
\alias{rBAPS-package}
|
||||
\alias{rBAPS}
|
||||
\title{Bayesian Analysis of Population Structure}
|
||||
\description{
|
||||
|
|
@ -12,3 +13,20 @@ Found a bug? Want to suggest a feature? Contribute to the scientific
|
|||
and open source communities by opening an issue on our home page.
|
||||
Check the "BugReports" field on the package description for the URL.
|
||||
}
|
||||
\seealso{
|
||||
Useful links:
|
||||
\itemize{
|
||||
\item Report bugs at \url{https://github.com/ocbe-uio/rBAPS/issues}
|
||||
}
|
||||
|
||||
}
|
||||
\author{
|
||||
\strong{Maintainer}: Waldir Leoncio \email{w.l.netto@medisin.uio.no}
|
||||
|
||||
Authors:
|
||||
\itemize{
|
||||
\item Jukka Corander \email{jukka.corander@medisin.uio.no}
|
||||
\item Gerry Tonkin-Hill \email{jukka.corander@medisin.uio.no}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -57,12 +57,15 @@ test_that("Files are imported correctly", {
|
|||
)
|
||||
)
|
||||
expect_equal(length(raw_bam[[1]]), 13)
|
||||
expect_error(
|
||||
greedyMix(
|
||||
data = file.path(path_inst, "FASTA_clustering_haploid.fasta"),
|
||||
format = "FASTA"
|
||||
),
|
||||
"FASTA format not yet supported on greedyMix"
|
||||
)
|
||||
})
|
||||
|
||||
df_fasta <- greedyMix(
|
||||
data = file.path(path_inst, "FASTA_clustering_haploid.fasta"),
|
||||
format = "FASTA"
|
||||
)
|
||||
test_that("greedyMix() works", {
|
||||
expect_error(greedyMix(file.path(path_inst, "vcf_example.vcf")))
|
||||
expect_error(greedyMix(file.path(path_inst, "bam_example.bam")))
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue